{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "62905e89-ec59-4c77-a045-890061dbc45f", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "id": "978571c2-aaaf-46f9-a355-da5ded8aa14b", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"Melbourne_housing_FULL.csv\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "72a45ce1-95c4-4de9-a112-26ce25b0ef63", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',\n", " 'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',\n", " 'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',\n", " 'Longtitude', 'Regionname', 'Propertycount'],\n", " dtype='object')" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "code", "execution_count": 4, "id": "c8559c7e-59ab-43e7-940a-c0eafe1fdb38", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SuburbAddressRoomsTypePriceMethodSellerGDateDistancePostcode...BathroomCarLandsizeBuildingAreaYearBuiltCouncilAreaLattitudeLongtitudeRegionnamePropertycount
0Abbotsford68 Studley St2hNaNSSJellis3/09/20162.53067.0...1.01.0126.0NaNNaNYarra City Council-37.8014144.9958Northern Metropolitan4019.0
1Abbotsford85 Turner St2h1480000.0SBiggin3/12/20162.53067.0...1.01.0202.0NaNNaNYarra City Council-37.7996144.9984Northern Metropolitan4019.0
2Abbotsford25 Bloomburg St2h1035000.0SBiggin4/02/20162.53067.0...1.00.0156.079.01900.0Yarra City Council-37.8079144.9934Northern Metropolitan4019.0
3Abbotsford18/659 Victoria St3uNaNVBRounds4/02/20162.53067.0...2.01.00.0NaNNaNYarra City Council-37.8114145.0116Northern Metropolitan4019.0
4Abbotsford5 Charles St3h1465000.0SPBiggin4/03/20172.53067.0...2.00.0134.0150.01900.0Yarra City Council-37.8093144.9944Northern Metropolitan4019.0
\n", "

5 rows × 21 columns

\n", "
" ], "text/plain": [ " Suburb Address Rooms Type Price Method SellerG \\\n", "0 Abbotsford 68 Studley St 2 h NaN SS Jellis \n", "1 Abbotsford 85 Turner St 2 h 1480000.0 S Biggin \n", "2 Abbotsford 25 Bloomburg St 2 h 1035000.0 S Biggin \n", "3 Abbotsford 18/659 Victoria St 3 u NaN VB Rounds \n", "4 Abbotsford 5 Charles St 3 h 1465000.0 SP Biggin \n", "\n", " Date Distance Postcode ... Bathroom Car Landsize BuildingArea \\\n", "0 3/09/2016 2.5 3067.0 ... 1.0 1.0 126.0 NaN \n", "1 3/12/2016 2.5 3067.0 ... 1.0 1.0 202.0 NaN \n", "2 4/02/2016 2.5 3067.0 ... 1.0 0.0 156.0 79.0 \n", "3 4/02/2016 2.5 3067.0 ... 2.0 1.0 0.0 NaN \n", "4 4/03/2017 2.5 3067.0 ... 2.0 0.0 134.0 150.0 \n", "\n", " YearBuilt CouncilArea Lattitude Longtitude Regionname \\\n", "0 NaN Yarra City Council -37.8014 144.9958 Northern Metropolitan \n", "1 NaN Yarra City Council -37.7996 144.9984 Northern Metropolitan \n", "2 1900.0 Yarra City Council -37.8079 144.9934 Northern Metropolitan \n", "3 NaN Yarra City Council -37.8114 145.0116 Northern Metropolitan \n", "4 1900.0 Yarra City Council -37.8093 144.9944 Northern Metropolitan \n", "\n", " Propertycount \n", "0 4019.0 \n", "1 4019.0 \n", "2 4019.0 \n", "3 4019.0 \n", "4 4019.0 \n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "094c00f8-7409-4890-ba4c-75b361e2a10f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 34857 entries, 0 to 34856\n", "Data columns (total 21 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Suburb 34857 non-null object \n", " 1 Address 34857 non-null object \n", " 2 Rooms 34857 non-null int64 \n", " 3 Type 34857 non-null object \n", " 4 Price 27247 non-null float64\n", " 5 Method 34857 non-null object \n", " 6 SellerG 34857 non-null object \n", " 7 Date 34857 non-null object \n", " 8 
Distance 34856 non-null float64\n", " 9 Postcode 34856 non-null float64\n", " 10 Bedroom2 26640 non-null float64\n", " 11 Bathroom 26631 non-null float64\n", " 12 Car 26129 non-null float64\n", " 13 Landsize 23047 non-null float64\n", " 14 BuildingArea 13742 non-null float64\n", " 15 YearBuilt 15551 non-null float64\n", " 16 CouncilArea 34854 non-null object \n", " 17 Lattitude 26881 non-null float64\n", " 18 Longtitude 26881 non-null float64\n", " 19 Regionname 34854 non-null object \n", " 20 Propertycount 34854 non-null float64\n", "dtypes: float64(12), int64(1), object(8)\n", "memory usage: 5.6+ MB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 6, "id": "f38712ee-05f6-4e84-869f-3daa9c1dfc1e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Suburb 351\n", "Address 34009\n", "Rooms 12\n", "Type 3\n", "Price 2871\n", "Method 9\n", "SellerG 388\n", "Date 78\n", "Distance 215\n", "Postcode 211\n", "Bedroom2 15\n", "Bathroom 11\n", "Car 15\n", "Landsize 1684\n", "BuildingArea 740\n", "YearBuilt 160\n", "CouncilArea 33\n", "Lattitude 13402\n", "Longtitude 14524\n", "Regionname 8\n", "Propertycount 342\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.nunique()" ] }, { "cell_type": "code", "execution_count": 7, "id": "40f7c561-4daa-48d5-a956-827f121c844e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(34857, 21)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 8, "id": "4de8ee87-e2de-4af3-a743-955b04703a0c", "metadata": {}, "outputs": [], "source": [ "cols = [\"Suburb\", \"Rooms\", \"Type\", \"Method\", \"SellerG\", \"Regionname\",\n", " \"Propertycount\", \"Distance\", \"CouncilArea\", \"Bedroom2\", \"Bathroom\",\n", " \"Car\", \"Landsize\", \"BuildingArea\", \"Price\"]" ] }, { "cell_type": "code", "execution_count": 9, "id": 
"b4293031-9acf-4f1b-8c3c-557d7e9f5147", "metadata": {}, "outputs": [], "source": [ "# Take an explicit copy so the column assignments below modify an independent frame\n", "# rather than a slice of df (the cause of SettingWithCopyWarning).\n", "dataset = df[cols].copy()" ] }, { "cell_type": "code", "execution_count": 10, "id": "cca9b80b-6f14-453e-86de-d322c2e5ccc0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(34857, 15)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.shape" ] }, { "cell_type": "code", "execution_count": 11, "id": "efff86a4-99cb-4226-b38a-38f60782c752", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Suburb 0\n", "Rooms 0\n", "Type 0\n", "Method 0\n", "SellerG 0\n", "Regionname 3\n", "Propertycount 3\n", "Distance 1\n", "CouncilArea 3\n", "Bedroom2 8217\n", "Bathroom 8226\n", "Car 8728\n", "Landsize 11810\n", "BuildingArea 21115\n", "Price 7610\n", "dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.isna().sum()" ] }, { "cell_type": "code", "execution_count": 12, "id": "e95473e0-4ea4-46cc-bdcd-a985da606fc1", "metadata": {}, "outputs": [], "source": [ "cols_fill_zero = [\"Propertycount\", \"Distance\", \"Bedroom2\", \"Bathroom\", \"Car\"]" ] }, { "cell_type": "code", "execution_count": 13, "id": "b8d2b695-7d88-4814-8798-f58192b757a9", "metadata": {}, "outputs": [], "source": [ "dataset[cols_fill_zero] = dataset[cols_fill_zero].fillna(0)" ] }, { "cell_type": "code", "execution_count": 14, "id": "00af7e86-109d-4dc4-adc7-0a6bc33308aa", "metadata": {}, "outputs": [ { 
"data": { "text/plain": [ "Suburb 0\n", "Rooms 0\n", "Type 0\n", "Method 0\n", "SellerG 0\n", "Regionname 3\n", "Propertycount 0\n", "Distance 0\n", "CouncilArea 3\n", "Bedroom2 0\n", "Bathroom 0\n", "Car 0\n", "Landsize 11810\n", "BuildingArea 21115\n", "Price 7610\n", "dtype: int64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.isna().sum()" ] }, { "cell_type": "code", "execution_count": 15, "id": "b398a29f-58a4-44d1-9cc2-c3c881fe9a8b", "metadata": {}, "outputs": [], "source": [ "# Impute missing Landsize / BuildingArea values with each column's own mean.\n", "# fillna with a per-column dict returns a new frame, so nothing is assigned onto a\n", "# possible slice of df (avoids SettingWithCopyWarning).\n", "# NOTE: the means use every row, i.e. they are computed before the train/test split below.\n", "dataset = dataset.fillna({\n", "    \"Landsize\": dataset[\"Landsize\"].mean(),\n", "    \"BuildingArea\": dataset[\"BuildingArea\"].mean(),\n", "})" ] }, { "cell_type": "code", "execution_count": 16, "id": "0943b543-02c3-4532-bef3-527e1e12f4ad", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Suburb 0\n", "Rooms 0\n", "Type 0\n", "Method 0\n", "SellerG 0\n", "Regionname 3\n", "Propertycount 0\n", 
"Distance 0\n", "CouncilArea 3\n", "Bedroom2 0\n", "Bathroom 0\n", "Car 0\n", "Landsize 0\n", "BuildingArea 0\n", "Price 7610\n", "dtype: int64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.isna().sum()" ] }, { "cell_type": "code", "execution_count": 17, "id": "beb8bf99-aad3-433a-8e55-2bbbd47bf3ad", "metadata": {}, "outputs": [], "source": [ "# Drop the rows that still have missing values (Regionname, CouncilArea, Price).\n", "# Rebinding the name instead of using inplace=True avoids mutating a possible slice of df.\n", "dataset = dataset.dropna()" ] }, { "cell_type": "code", "execution_count": 18, "id": "c3ea83e3-077d-423b-91c9-dcbc1ad1fb44", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Suburb 0\n", "Rooms 0\n", "Type 0\n", "Method 0\n", "SellerG 0\n", "Regionname 0\n", "Propertycount 0\n", "Distance 0\n", "CouncilArea 0\n", "Bedroom2 0\n", "Bathroom 0\n", "Car 0\n", "Landsize 0\n", "BuildingArea 0\n", "Price 0\n", "dtype: int64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.isna().sum()" ] }, { "cell_type": "code", "execution_count": 19, "id": "30b78704-17f5-48e3-ac10-1467daef8c0c", "metadata": {}, "outputs": [], "source": [ "# One-hot encode the categorical columns; drop_first=True drops one level per column\n", "# to avoid the dummy-variable trap.\n", "dataset = pd.get_dummies(dataset, drop_first=True)" ] }, { "cell_type": "code", "execution_count": 20, "id": "b047f7d8-2625-406c-81c9-77a027a484f9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(27244, 745)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.shape" ] }, { "cell_type": "code", "execution_count": 21, "id": "8119b464-3619-4164-b21b-12e954d7d456", "metadata": {}, "outputs": [], 
"source": [ "# Separate the feature matrix from the regression target.\n", "x = dataset.drop(\"Price\", axis=1)\n", "y = dataset[\"Price\"]" ] }, { "cell_type": "code", "execution_count": 22, "id": "84fb0433-1802-4c90-95cc-bb4138592661", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 23, "id": "1c54e05b-1931-438a-8d2e-4553f457ffb3", "metadata": {}, "outputs": [], "source": [ "# Hold out 20% of the rows for testing. The fixed seed makes the split, and therefore\n", "# every score reported below, reproducible across kernel restarts.\n", "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": 24, "id": "46623822-6158-4345-abb0-0ab6a3788dfd", "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression" ] }, { "cell_type": "code", "execution_count": 25, "id": "72752dc5-aaf3-45f6-80aa-7e232f754fbe", "metadata": {}, "outputs": [], "source": [ "lr = LinearRegression()" ] }, { "cell_type": "code", "execution_count": 26, "id": "275f1536-463c-45cb-868f-d1547a98e033", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "LinearRegression()" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lr.fit(x_train, y_train)" ] }, { "cell_type": "code", "execution_count": 27, "id": "5bb0d7ce-1e32-4e4b-a2c6-cc66ecd98744", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6737548533713945" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lr.score(x_test, y_test)" ] }, { "cell_type": "code", "execution_count": 28, "id": "c754b39b-a4a1-4685-9b36-22ba204c1028", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6797993843396896" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lr.score(x_train, y_train)" ] }, { "cell_type": "markdown", "id": "a24ee128-32db-499a-b1d6-e5fa7de1f2e4", "metadata": {}, "source": [ "Lasso (L1-regularized) objective:\n", "\n", "$$\\frac{1}{2\\,n_{\\text{samples}}} \\lVert y - Xw \\rVert_2^2 + \\alpha \\lVert w \\rVert_1$$" ] }, { "cell_type": "code", "execution_count": 29, "id": "1dc6e7f7-47d0-4c80-b922-c8d44b258fb1", "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Lasso" ] }, { "cell_type": "code", "execution_count": 30, "id": "ae939a27-a28b-420a-97e6-48bb220bfcf8", "metadata": {}, "outputs": [], "source": [ "l1_lr = Lasso(alpha=0.1, max_iter=1000, tol=0.0001)" ] }, { "cell_type": "code", "execution_count": 31, "id": "b405de3e-2e37-4f96-840b-999568cadf8a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "D:\\envs\\stark\\lib\\site-packages\\sklearn\\linear_model\\_coordinate_descent.py:628: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.476e+15, tolerance: 9.217e+11\n", " model = cd_fast.enet_coordinate_descent(\n" ] }, { "data": { "text/html": [ "
Lasso(alpha=0.1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Lasso(alpha=0.1)" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "l1_lr.fit(x_train, y_train)" ] }, { "cell_type": "code", "execution_count": 32, "id": "93007622-75c8-4f3e-93ac-62354494fe32", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6797973052867211" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "l1_lr.score(x_train, y_train)" ] }, { "cell_type": "code", "execution_count": 33, "id": "4aed753a-f772-4cda-b304-65cc0d6ad7db", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6737830516738357" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "l1_lr.score(x_test, y_test)" ] }, { "cell_type": "markdown", "id": "3e89bd7d-0a48-44be-b085-3188d8ad9e40", "metadata": {}, "source": [ "Ridge (L2-regularized) objective:\n", "\n", "$$\\lVert y - Xw \\rVert_2^2 + \\alpha \\lVert w \\rVert_2^2$$" ] }, { "cell_type": "code", "execution_count": 34, "id": "0c98d7c3-3d65-47d6-a8c4-3b144574496b", "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Ridge" ] }, { "cell_type": "code", "execution_count": 35, "id": "cc333c83-6cb3-4b8e-95bb-6d3053092fab", "metadata": {}, "outputs": [], "source": [ "l2_lr = Ridge(alpha=0.1, max_iter=1000, tol=0.0001)" ] }, { "cell_type": "code", "execution_count": 36, "id": "4cc4a3e2-8012-43ee-842a-863db51a436f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Ridge(alpha=0.1, max_iter=1000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Ridge(alpha=0.1, max_iter=1000)" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "l2_lr.fit(x_train, y_train)" ] }, { "cell_type": "code", "execution_count": 37, "id": "6f3e1601-dce2-4e2c-9f55-82671c359d78", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6797776548635094" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "l2_lr.score(x_train, y_train)" ] }, { "cell_type": "code", "execution_count": 38, "id": "e1fb30ae-4cf2-4c54-be50-80aa515f4149", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6741118456030124" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "l2_lr.score(x_test, y_test)" ] }, { "cell_type": "code", "execution_count": null, "id": "247e0a14-2234-4853-aa85-bb3ccdc403b4", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 5 }