You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1020 lines
41 KiB
Plaintext

6 months ago
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "62905e89-ec59-4c77-a045-890061dbc45f",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "978571c2-aaaf-46f9-a355-da5ded8aa14b",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"Melbourne_housing_FULL.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "72a45ce1-95c4-4de9-a112-26ce25b0ef63",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',\n",
" 'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',\n",
" 'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',\n",
" 'Longtitude', 'Regionname', 'Propertycount'],\n",
" dtype='object')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c8559c7e-59ab-43e7-940a-c0eafe1fdb38",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Suburb</th>\n",
" <th>Address</th>\n",
" <th>Rooms</th>\n",
" <th>Type</th>\n",
" <th>Price</th>\n",
" <th>Method</th>\n",
" <th>SellerG</th>\n",
" <th>Date</th>\n",
" <th>Distance</th>\n",
" <th>Postcode</th>\n",
" <th>...</th>\n",
" <th>Bathroom</th>\n",
" <th>Car</th>\n",
" <th>Landsize</th>\n",
" <th>BuildingArea</th>\n",
" <th>YearBuilt</th>\n",
" <th>CouncilArea</th>\n",
" <th>Lattitude</th>\n",
" <th>Longtitude</th>\n",
" <th>Regionname</th>\n",
" <th>Propertycount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Abbotsford</td>\n",
" <td>68 Studley St</td>\n",
" <td>2</td>\n",
" <td>h</td>\n",
" <td>NaN</td>\n",
" <td>SS</td>\n",
" <td>Jellis</td>\n",
" <td>3/09/2016</td>\n",
" <td>2.5</td>\n",
" <td>3067.0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>126.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Yarra City Council</td>\n",
" <td>-37.8014</td>\n",
" <td>144.9958</td>\n",
" <td>Northern Metropolitan</td>\n",
" <td>4019.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Abbotsford</td>\n",
" <td>85 Turner St</td>\n",
" <td>2</td>\n",
" <td>h</td>\n",
" <td>1480000.0</td>\n",
" <td>S</td>\n",
" <td>Biggin</td>\n",
" <td>3/12/2016</td>\n",
" <td>2.5</td>\n",
" <td>3067.0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>202.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Yarra City Council</td>\n",
" <td>-37.7996</td>\n",
" <td>144.9984</td>\n",
" <td>Northern Metropolitan</td>\n",
" <td>4019.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Abbotsford</td>\n",
" <td>25 Bloomburg St</td>\n",
" <td>2</td>\n",
" <td>h</td>\n",
" <td>1035000.0</td>\n",
" <td>S</td>\n",
" <td>Biggin</td>\n",
" <td>4/02/2016</td>\n",
" <td>2.5</td>\n",
" <td>3067.0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>156.0</td>\n",
" <td>79.0</td>\n",
" <td>1900.0</td>\n",
" <td>Yarra City Council</td>\n",
" <td>-37.8079</td>\n",
" <td>144.9934</td>\n",
" <td>Northern Metropolitan</td>\n",
" <td>4019.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Abbotsford</td>\n",
" <td>18/659 Victoria St</td>\n",
" <td>3</td>\n",
" <td>u</td>\n",
" <td>NaN</td>\n",
" <td>VB</td>\n",
" <td>Rounds</td>\n",
" <td>4/02/2016</td>\n",
" <td>2.5</td>\n",
" <td>3067.0</td>\n",
" <td>...</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Yarra City Council</td>\n",
" <td>-37.8114</td>\n",
" <td>145.0116</td>\n",
" <td>Northern Metropolitan</td>\n",
" <td>4019.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Abbotsford</td>\n",
" <td>5 Charles St</td>\n",
" <td>3</td>\n",
" <td>h</td>\n",
" <td>1465000.0</td>\n",
" <td>SP</td>\n",
" <td>Biggin</td>\n",
" <td>4/03/2017</td>\n",
" <td>2.5</td>\n",
" <td>3067.0</td>\n",
" <td>...</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>134.0</td>\n",
" <td>150.0</td>\n",
" <td>1900.0</td>\n",
" <td>Yarra City Council</td>\n",
" <td>-37.8093</td>\n",
" <td>144.9944</td>\n",
" <td>Northern Metropolitan</td>\n",
" <td>4019.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" Suburb Address Rooms Type Price Method SellerG \\\n",
"0 Abbotsford 68 Studley St 2 h NaN SS Jellis \n",
"1 Abbotsford 85 Turner St 2 h 1480000.0 S Biggin \n",
"2 Abbotsford 25 Bloomburg St 2 h 1035000.0 S Biggin \n",
"3 Abbotsford 18/659 Victoria St 3 u NaN VB Rounds \n",
"4 Abbotsford 5 Charles St 3 h 1465000.0 SP Biggin \n",
"\n",
" Date Distance Postcode ... Bathroom Car Landsize BuildingArea \\\n",
"0 3/09/2016 2.5 3067.0 ... 1.0 1.0 126.0 NaN \n",
"1 3/12/2016 2.5 3067.0 ... 1.0 1.0 202.0 NaN \n",
"2 4/02/2016 2.5 3067.0 ... 1.0 0.0 156.0 79.0 \n",
"3 4/02/2016 2.5 3067.0 ... 2.0 1.0 0.0 NaN \n",
"4 4/03/2017 2.5 3067.0 ... 2.0 0.0 134.0 150.0 \n",
"\n",
" YearBuilt CouncilArea Lattitude Longtitude Regionname \\\n",
"0 NaN Yarra City Council -37.8014 144.9958 Northern Metropolitan \n",
"1 NaN Yarra City Council -37.7996 144.9984 Northern Metropolitan \n",
"2 1900.0 Yarra City Council -37.8079 144.9934 Northern Metropolitan \n",
"3 NaN Yarra City Council -37.8114 145.0116 Northern Metropolitan \n",
"4 1900.0 Yarra City Council -37.8093 144.9944 Northern Metropolitan \n",
"\n",
" Propertycount \n",
"0 4019.0 \n",
"1 4019.0 \n",
"2 4019.0 \n",
"3 4019.0 \n",
"4 4019.0 \n",
"\n",
"[5 rows x 21 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "094c00f8-7409-4890-ba4c-75b361e2a10f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 34857 entries, 0 to 34856\n",
"Data columns (total 21 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Suburb 34857 non-null object \n",
" 1 Address 34857 non-null object \n",
" 2 Rooms 34857 non-null int64 \n",
" 3 Type 34857 non-null object \n",
" 4 Price 27247 non-null float64\n",
" 5 Method 34857 non-null object \n",
" 6 SellerG 34857 non-null object \n",
" 7 Date 34857 non-null object \n",
" 8 Distance 34856 non-null float64\n",
" 9 Postcode 34856 non-null float64\n",
" 10 Bedroom2 26640 non-null float64\n",
" 11 Bathroom 26631 non-null float64\n",
" 12 Car 26129 non-null float64\n",
" 13 Landsize 23047 non-null float64\n",
" 14 BuildingArea 13742 non-null float64\n",
" 15 YearBuilt 15551 non-null float64\n",
" 16 CouncilArea 34854 non-null object \n",
" 17 Lattitude 26881 non-null float64\n",
" 18 Longtitude 26881 non-null float64\n",
" 19 Regionname 34854 non-null object \n",
" 20 Propertycount 34854 non-null float64\n",
"dtypes: float64(12), int64(1), object(8)\n",
"memory usage: 5.6+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f38712ee-05f6-4e84-869f-3daa9c1dfc1e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Suburb 351\n",
"Address 34009\n",
"Rooms 12\n",
"Type 3\n",
"Price 2871\n",
"Method 9\n",
"SellerG 388\n",
"Date 78\n",
"Distance 215\n",
"Postcode 211\n",
"Bedroom2 15\n",
"Bathroom 11\n",
"Car 15\n",
"Landsize 1684\n",
"BuildingArea 740\n",
"YearBuilt 160\n",
"CouncilArea 33\n",
"Lattitude 13402\n",
"Longtitude 14524\n",
"Regionname 8\n",
"Propertycount 342\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.nunique()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "40f7c561-4daa-48d5-a956-827f121c844e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(34857, 21)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "4de8ee87-e2de-4af3-a743-955b04703a0c",
"metadata": {},
"outputs": [],
"source": [
"cols = [\"Suburb\", \"Rooms\", \"Type\", \"Method\", \"SellerG\", \"Regionname\",\n",
" \"Propertycount\", \"Distance\", \"CouncilArea\", \"Bedroom2\", \"Bathroom\",\n",
" \"Car\", \"Landsize\", \"BuildingArea\", \"Price\"]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "b4293031-9acf-4f1b-8c3c-557d7e9f5147",
"metadata": {},
"outputs": [],
"source": [
"# Take an explicit copy of the selected columns so the cleaning steps below work on\n",
"# an independent frame; writing into a bare slice of df raises SettingWithCopyWarning.\n",
"dataset = df[cols].copy()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "cca9b80b-6f14-453e-86de-d322c2e5ccc0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(34857, 15)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "efff86a4-99cb-4226-b38a-38f60782c752",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Suburb 0\n",
"Rooms 0\n",
"Type 0\n",
"Method 0\n",
"SellerG 0\n",
"Regionname 3\n",
"Propertycount 3\n",
"Distance 1\n",
"CouncilArea 3\n",
"Bedroom2 8217\n",
"Bathroom 8226\n",
"Car 8728\n",
"Landsize 11810\n",
"BuildingArea 21115\n",
"Price 7610\n",
"dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "e95473e0-4ea4-46cc-bdcd-a985da606fc1",
"metadata": {},
"outputs": [],
"source": [
"cols_fill_zero = [\"Propertycount\", \"Distance\", \"Bedroom2\", \"Bathroom\", \"Car\"]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "b8d2b695-7d88-4814-8798-f58192b757a9",
"metadata": {},
"outputs": [],
"source": [
"# Replace missing values in the cols_fill_zero columns with 0. fillna with a per-column\n",
"# mapping returns a new frame, so nothing is assigned into a slice of df and no\n",
"# SettingWithCopyWarning is raised; columns not in the mapping are left untouched.\n",
"dataset = dataset.fillna({col: 0 for col in cols_fill_zero})"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "00af7e86-109d-4dc4-adc7-0a6bc33308aa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Suburb 0\n",
"Rooms 0\n",
"Type 0\n",
"Method 0\n",
"SellerG 0\n",
"Regionname 3\n",
"Propertycount 0\n",
"Distance 0\n",
"CouncilArea 3\n",
"Bedroom2 0\n",
"Bathroom 0\n",
"Car 0\n",
"Landsize 11810\n",
"BuildingArea 21115\n",
"Price 7610\n",
"dtype: int64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "b398a29f-58a4-44d1-9cc2-c3c881fe9a8b",
"metadata": {},
"outputs": [],
"source": [
"# Impute the two area columns with their own column means. Both means are computed on\n",
"# the frame as it is before filling, and fillna returns a new frame instead of writing\n",
"# into a slice of df.\n",
"# NOTE(review): the means are taken over all rows, including rows that are later placed\n",
"# in the test split - confirm this is acceptable for the evaluation.\n",
"dataset = dataset.fillna({\n",
"    \"Landsize\": dataset[\"Landsize\"].mean(),\n",
"    \"BuildingArea\": dataset[\"BuildingArea\"].mean(),\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "0943b543-02c3-4532-bef3-527e1e12f4ad",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Suburb 0\n",
"Rooms 0\n",
"Type 0\n",
"Method 0\n",
"SellerG 0\n",
"Regionname 3\n",
"Propertycount 0\n",
"Distance 0\n",
"CouncilArea 3\n",
"Bedroom2 0\n",
"Bathroom 0\n",
"Car 0\n",
"Landsize 0\n",
"BuildingArea 0\n",
"Price 7610\n",
"dtype: int64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "beb8bf99-aad3-433a-8e55-2bbbd47bf3ad",
"metadata": {},
"outputs": [],
"source": [
"# Drop every row that still contains a missing value (per the null count above:\n",
"# Regionname, CouncilArea and the Price target). Rebinding instead of inplace=True\n",
"# avoids mutating a slice and keeps the cell safe to re-run.\n",
"dataset = dataset.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "c3ea83e3-077d-423b-91c9-dcbc1ad1fb44",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Suburb 0\n",
"Rooms 0\n",
"Type 0\n",
"Method 0\n",
"SellerG 0\n",
"Regionname 0\n",
"Propertycount 0\n",
"Distance 0\n",
"CouncilArea 0\n",
"Bedroom2 0\n",
"Bathroom 0\n",
"Car 0\n",
"Landsize 0\n",
"BuildingArea 0\n",
"Price 0\n",
"dtype: int64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "30b78704-17f5-48e3-ac10-1467daef8c0c",
"metadata": {},
"outputs": [],
"source": [
"# One-hot encode the categorical columns; drop_first=True omits one level per\n",
"# category to avoid the dummy-variable trap.\n",
"dataset = pd.get_dummies(dataset, drop_first=True)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "b047f7d8-2625-406c-81c9-77a027a484f9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(27244, 745)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.shape"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "8119b464-3619-4164-b21b-12e954d7d456",
"metadata": {},
"outputs": [],
"source": [
"x = dataset.drop(\"Price\", axis=1)\n",
"y = dataset[\"Price\"]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "84fb0433-1802-4c90-95cc-bb4138592661",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "1c54e05b-1931-438a-8d2e-4553f457ffb3",
"metadata": {},
"outputs": [],
"source": [
"# Hold out 20% of the rows for evaluation. A fixed random_state makes the split, and\n",
"# therefore every score reported below, reproducible across kernel restarts.\n",
"x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "46623822-6158-4345-abb0-0ab6a3788dfd",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import LinearRegression"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "72752dc5-aaf3-45f6-80aa-7e232f754fbe",
"metadata": {},
"outputs": [],
"source": [
"lr = LinearRegression()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "275f1536-463c-45cb-868f-d1547a98e033",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label 
{background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-r
],
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lr.fit(x_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "5bb0d7ce-1e32-4e4b-a2c6-cc66ecd98744",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6737548533713945"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lr.score(x_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "c754b39b-a4a1-4685-9b36-22ba204c1028",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6797993843396896"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lr.score(x_train, y_train)"
]
},
{
"cell_type": "markdown",
"id": "a24ee128-32db-499a-b1d6-e5fa7de1f2e4",
"metadata": {},
"source": [
"Lasso (L1-regularised) objective:\n",
"\n",
"$$\\frac{1}{2\\,n_{\\text{samples}}}\\,\\lVert y - Xw\\rVert_2^2 + \\alpha\\,\\lVert w\\rVert_1$$"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "1dc6e7f7-47d0-4c80-b922-c8d44b258fb1",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import Lasso"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "ae939a27-a28b-420a-97e6-48bb220bfcf8",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Standardise the features before the L1 fit. The design matrix mixes 0/1 dummy columns\n",
"# with large-valued numeric columns, and fitting Lasso on it unscaled raised a\n",
"# ConvergenceWarning; scaling also makes the penalty comparable across features.\n",
"# The pipeline exposes the same fit/score interface used in the following cells.\n",
"l1_lr = make_pipeline(StandardScaler(), Lasso(alpha=0.1, max_iter=1000, tol=0.0001))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "b405de3e-2e37-4f96-840b-999568cadf8a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\envs\\stark\\lib\\site-packages\\sklearn\\linear_model\\_coordinate_descent.py:628: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.476e+15, tolerance: 9.217e+11\n",
" model = cd_fast.enet_coordinate_descent(\n"
]
},
{
"data": {
"text/html": [
"<style>#sk-container-id-2 {color: black;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label 
{background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-r
],
"text/plain": [
"Lasso(alpha=0.1)"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"l1_lr.fit(x_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "93007622-75c8-4f3e-93ac-62354494fe32",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6797973052867211"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"l1_lr.score(x_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "4aed753a-f772-4cda-b304-65cc0d6ad7db",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6737830516738357"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"l1_lr.score(x_test, y_test)"
]
},
{
"cell_type": "markdown",
"id": "3e89bd7d-0a48-44be-b085-3188d8ad9e40",
"metadata": {},
"source": [
"Ridge (L2-regularised) objective:\n",
"\n",
"$$\\lVert y - Xw\\rVert_2^2 + \\alpha\\,\\lVert w\\rVert_2^2$$"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "0c98d7c3-3d65-47d6-a8c4-3b144574496b",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import Ridge"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "cc333c83-6cb3-4b8e-95bb-6d3053092fab",
"metadata": {},
"outputs": [],
"source": [
"l2_lr = Ridge(alpha=0.1, max_iter=1000, tol=0.0001)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "4cc4a3e2-8012-43ee-842a-863db51a436f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-3 {color: black;}#sk-container-id-3 pre{padding: 0;}#sk-container-id-3 div.sk-toggleable {background-color: white;}#sk-container-id-3 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-3 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-3 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-3 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-3 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-3 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-3 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-3 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-3 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-3 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-3 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-3 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-3 div.sk-label:hover label.sk-toggleable__label 
{background-color: #d4ebff;}#sk-container-id-3 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-3 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-3 div.sk-item {position: relative;z-index: 1;}#sk-container-id-3 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-3 div.sk-item::before, #sk-container-id-3 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-3 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-3 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-3 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-3 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-3 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-3 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-3 div.sk-label-container {text-align: center;}#sk-container-id-3 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-3 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-3\" class=\"sk-top-container\"><div class=\"sk-text-r
],
"text/plain": [
"Ridge(alpha=0.1, max_iter=1000)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"l2_lr.fit(x_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "6f3e1601-dce2-4e2c-9f55-82671c359d78",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6797776548635094"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"l2_lr.score(x_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "e1fb30ae-4cf2-4c54-be50-80aa515f4149",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6741118456030124"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"l2_lr.score(x_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "247e0a14-2234-4853-aa85-bb3ccdc403b4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}