{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "33050e33", "metadata": { "scrolled": true }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "id": "4b8f9c58", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"salaries.csv\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "974d9dcf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
companyjobdegreesalary_more_then_100k
0googlesales executivebachelors0
1googlesales executivemasters0
2googlebusiness managerbachelors1
3googlebusiness managermasters1
4googlecomputer programmerbachelors0
\n", "
" ], "text/plain": [ " company job degree salary_more_then_100k\n", "0 google sales executive bachelors 0\n", "1 google sales executive masters 0\n", "2 google business manager bachelors 1\n", "3 google business manager masters 1\n", "4 google computer programmer bachelors 0" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 6, "id": "e55303b1", "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import LabelEncoder" ] }, { "cell_type": "code", "execution_count": 7, "id": "7b55fb49", "metadata": {}, "outputs": [], "source": [ "le_company = LabelEncoder()\n", "le_job = LabelEncoder()\n", "le_degree = LabelEncoder()" ] }, { "cell_type": "code", "execution_count": 10, "id": "e271294f", "metadata": {}, "outputs": [], "source": [ "# Fit every column with its own encoder so each encoder keeps the classes_ of\n", "# its column (required for a correct inverse_transform / encoding of new rows).\n", "company_n = le_company.fit_transform(df.company)\n", "job_n = le_job.fit_transform(df.job)\n", "degree_n = le_degree.fit_transform(df.degree)" ] }, { "cell_type": "code", "execution_count": 11, "id": "a2e7b968", "metadata": {}, "outputs": [], "source": [ "salary = df.salary_more_then_100k" ] }, { "cell_type": "code", "execution_count": 12, "id": "769ad437", "metadata": {}, "outputs": [], "source": [ "df.company, df.job, df.degree = company_n, job_n, degree_n" ] }, { "cell_type": "code", "execution_count": 15, "id": "00cdea98", "metadata": {}, "outputs": [], "source": [ "features = df.drop(\"salary_more_then_100k\", axis=\"columns\")" ] }, { "cell_type": "code", "execution_count": 16, "id": "accdeb52", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
companyjobdegree
0220
1221
2200
3201
4210
\n", "
" ], "text/plain": [ " company job degree\n", "0 2 2 0\n", "1 2 2 1\n", "2 2 0 0\n", "3 2 0 1\n", "4 2 1 0" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "features.head()" ] }, { "cell_type": "code", "execution_count": 17, "id": "d3592e22", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 18, "id": "46bf28c9", "metadata": {}, "outputs": [], "source": [ "x_train, x_test, y_train, y_test = train_test_split(features, salary, test_size=0.2, random_state=10)" ] }, { "cell_type": "code", "execution_count": 19, "id": "0d414a37", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(16, 12, 4)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(features), len(x_train), len(x_test)" ] }, { "cell_type": "code", "execution_count": 20, "id": "c5df5983", "metadata": {}, "outputs": [], "source": [ "from sklearn import tree" ] }, { "cell_type": "code", "execution_count": 21, "id": "7c36264c", "metadata": {}, "outputs": [], "source": [ "dtc = tree.DecisionTreeClassifier()" ] }, { "cell_type": "code", "execution_count": 22, "id": "5655fe44", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DecisionTreeClassifier()" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dtc.fit(x_train, y_train)" ] }, { "cell_type": "code", "execution_count": 23, "id": "7f542c33", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.75" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dtc.score(x_test, y_test)" ] }, { "cell_type": "code", "execution_count": 25, "id": "3e61b2f3", "metadata": {}, "outputs": [], "source": [ "pred = dtc.predict(x_test)" ] }, { "cell_type": "code", "execution_count": 24, "id": "99f4aa4f", "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import confusion_matrix" ] }, 
{ "cell_type": "code", "execution_count": 27, "id": "4087c69d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1, 0],\n", " [1, 2]], dtype=int64)" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "confusion_matrix(y_test, pred)" ] }, { "cell_type": "markdown", "id": "31569bc2", "metadata": {}, "source": [ "## Titanic: predicting passenger survival with a decision tree" ] }, { "cell_type": "code", "execution_count": 28, "id": "fb427dfd", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"titanic.csv\")" ] }, { "cell_type": "code", "execution_count": 29, "id": "734fbe56", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 31, "id": "236e74ea", "metadata": {}, "outputs": [], "source": [ "df = df.drop([\"PassengerId\", \"Name\", \"SibSp\", \"Parch\", \"Ticket\", \"Cabin\", \"Embarked\"], axis=\"columns\")  # drop columns assumed not to affect survival" ] }, { "cell_type": "code", "execution_count": 32, "id": "a348cb79", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeFare
003male22.07.2500
111female38.071.2833
213female26.07.9250
311female35.053.1000
403male35.08.0500
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age Fare\n", "0 0 3 male 22.0 7.2500\n", "1 1 1 female 38.0 71.2833\n", "2 1 3 female 26.0 7.9250\n", "3 1 1 female 35.0 53.1000\n", "4 0 3 male 35.0 8.0500" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 33, "id": "1c83ecbb", "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import LabelEncoder" ] }, { "cell_type": "code", "execution_count": 34, "id": "31934ad8", "metadata": {}, "outputs": [], "source": [ "le = LabelEncoder()" ] }, { "cell_type": "code", "execution_count": 35, "id": "a8c11cb0", "metadata": {}, "outputs": [], "source": [ "df.Sex = le.fit_transform(df.Sex)" ] }, { "cell_type": "code", "execution_count": 36, "id": "0ee729dd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeFare
003122.07.2500
111038.071.2833
213026.07.9250
311035.053.1000
403135.08.0500
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age Fare\n", "0 0 3 1 22.0 7.2500\n", "1 1 1 0 38.0 71.2833\n", "2 1 3 0 26.0 7.9250\n", "3 1 1 0 35.0 53.1000\n", "4 0 3 1 35.0 8.0500" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 37, "id": "6e78bdfc", "metadata": {}, "outputs": [], "source": [ "survived = df.Survived" ] }, { "cell_type": "code", "execution_count": 39, "id": "06950393", "metadata": {}, "outputs": [], "source": [ "features = df.drop(\"Survived\", axis=\"columns\")" ] }, { "cell_type": "markdown", "id": "3703250b", "metadata": {}, "source": [ "The data has missing values (in the `Age` column), which need to be handled before training." ] }, { "cell_type": "code", "execution_count": 56, "id": "8df08717", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pclass 0\n", "Sex 0\n", "Age 177\n", "Fare 0\n", "dtype: int64" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "features.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 59, "id": "6bbac05e", "metadata": {}, "outputs": [], "source": [ "features.Age = features.Age.fillna(features.Age.median())" ] }, { "cell_type": "code", "execution_count": 61, "id": "74fdd7ec", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pclass 0\n", "Sex 0\n", "Age 0\n", "Fare 0\n", "dtype: int64" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "features.isnull().sum()  # no missing values remain" ] }, { "cell_type": "code", "execution_count": 62, "id": "ce016346", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 63, "id": "1cd7238f", "metadata": {}, "outputs": [], "source": [ "x_train, x_test, y_train, y_test = train_test_split(features, survived, test_size=0.2, random_state=10)" ] }, { "cell_type": "code", "execution_count": 64, "id": "fa0b077c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(891, 
712, 179)" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(features), len(x_train), len(x_test)" ] }, { "cell_type": "code", "execution_count": 65, "id": "50eba766", "metadata": {}, "outputs": [], "source": [ "from sklearn import tree" ] }, { "cell_type": "code", "execution_count": 66, "id": "eb37b412", "metadata": {}, "outputs": [], "source": [ "dct = tree.DecisionTreeClassifier()" ] }, { "cell_type": "code", "execution_count": 67, "id": "a3e977e4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DecisionTreeClassifier()" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dct.fit(x_train, y_train)" ] }, { "cell_type": "code", "execution_count": 68, "id": "50f6560c", "metadata": {}, "outputs": [], "source": [ "y_pred = dct.predict(x_test)" ] }, { "cell_type": "code", "execution_count": 71, "id": "30e62926", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8268156424581006" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(y_pred == y_test).sum()/len(y_test)" ] }, { "cell_type": "code", "execution_count": 73, "id": "72259345", "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import confusion_matrix" ] }, { "cell_type": "code", "execution_count": 74, "id": "e46f73ce", "metadata": {}, "outputs": [], "source": [ "cm = confusion_matrix(y_test, y_pred)" ] }, { "cell_type": "code", "execution_count": 75, "id": "e4aeef28", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[102, 15],\n", " [ 16, 46]], dtype=int64)" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cm" ] }, { "cell_type": "code", "execution_count": null, "id": "b14a69f2", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": 
"ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 5 }