You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1089 lines
25 KiB
Plaintext
1089 lines
25 KiB
Plaintext
6 months ago
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"id": "33050e33",
|
||
|
"metadata": {
|
||
|
"scrolled": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import pandas as pd"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 3,
|
||
|
"id": "4b8f9c58",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df = pd.read_csv(\"salaries.csv\")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 5,
|
||
|
"id": "974d9dcf",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>company</th>\n",
|
||
|
" <th>job</th>\n",
|
||
|
" <th>degree</th>\n",
|
||
|
" <th>salary_more_then_100k</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>google</td>\n",
|
||
|
" <td>sales executive</td>\n",
|
||
|
" <td>bachelors</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>google</td>\n",
|
||
|
" <td>sales executive</td>\n",
|
||
|
" <td>masters</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>google</td>\n",
|
||
|
" <td>business manager</td>\n",
|
||
|
" <td>bachelors</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>google</td>\n",
|
||
|
" <td>business manager</td>\n",
|
||
|
" <td>masters</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>google</td>\n",
|
||
|
" <td>computer programmer</td>\n",
|
||
|
" <td>bachelors</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" company job degree salary_more_then_100k\n",
|
||
|
"0 google sales executive bachelors 0\n",
|
||
|
"1 google sales executive masters 0\n",
|
||
|
"2 google business manager bachelors 1\n",
|
||
|
"3 google business manager masters 1\n",
|
||
|
"4 google computer programmer bachelors 0"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 5,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.head()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 6,
|
||
|
"id": "e55303b1",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn.preprocessing import LabelEncoder"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 7,
|
||
|
"id": "7b55fb49",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"le_company = LabelEncoder()\n",
|
||
|
"le_job = LabelEncoder()\n",
|
||
|
"le_degree = LabelEncoder()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 10,
|
||
|
"id": "e271294f",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"company_n = le_company.fit_transform(df.company)\n",
|
||
|
"job_n = le_job.fit_transform(df.job)\n",
|
||
|
"degree_n = le_degree.fit_transform(df.degree)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 11,
|
||
|
"id": "a2e7b968",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"salary = df.salary_more_then_100k"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 12,
|
||
|
"id": "769ad437",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df.company, df.job, df.degree = company_n, job_n, degree_n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 15,
|
||
|
"id": "00cdea98",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"features = df.drop(\"salary_more_then_100k\", axis=\"columns\")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 16,
|
||
|
"id": "accdeb52",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>company</th>\n",
|
||
|
" <th>job</th>\n",
|
||
|
" <th>degree</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" company job degree\n",
|
||
|
"0 2 2 0\n",
|
||
|
"1 2 2 1\n",
|
||
|
"2 2 0 0\n",
|
||
|
"3 2 0 1\n",
|
||
|
"4 2 1 0"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 16,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"features.head()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 17,
|
||
|
"id": "d3592e22",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn.model_selection import train_test_split"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 18,
|
||
|
"id": "46bf28c9",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"x_train, x_test, y_train, y_test = train_test_split(features, salary, test_size=0.2, random_state=10)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 19,
|
||
|
"id": "0d414a37",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"(16, 12, 4)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 19,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"len(features), len(x_train), len(x_test)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 20,
|
||
|
"id": "c5df5983",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn import tree"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 21,
|
||
|
"id": "7c36264c",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"dtc = tree.DecisionTreeClassifier()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 22,
|
||
|
"id": "5655fe44",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"DecisionTreeClassifier()"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 22,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"dtc.fit(x_train, y_train)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 23,
|
||
|
"id": "7f542c33",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"0.75"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 23,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"dtc.score(x_test, y_test)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 25,
|
||
|
"id": "3e61b2f3",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"pred = dtc.predict(x_test)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 24,
|
||
|
"id": "99f4aa4f",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn.metrics import confusion_matrix"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 27,
|
||
|
"id": "4087c69d",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"array([[1, 0],\n",
|
||
|
" [1, 2]], dtype=int64)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 27,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"confusion_matrix(y_test, pred)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "31569bc2",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"## Titanic dataset"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 28,
|
||
|
"id": "fb427dfd",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df = pd.read_csv(\"titanic.csv\")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 29,
|
||
|
"id": "734fbe56",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>PassengerId</th>\n",
|
||
|
" <th>Survived</th>\n",
|
||
|
" <th>Pclass</th>\n",
|
||
|
" <th>Name</th>\n",
|
||
|
" <th>Sex</th>\n",
|
||
|
" <th>Age</th>\n",
|
||
|
" <th>SibSp</th>\n",
|
||
|
" <th>Parch</th>\n",
|
||
|
" <th>Ticket</th>\n",
|
||
|
" <th>Fare</th>\n",
|
||
|
" <th>Cabin</th>\n",
|
||
|
" <th>Embarked</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>Braund, Mr. Owen Harris</td>\n",
|
||
|
" <td>male</td>\n",
|
||
|
" <td>22.0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>A/5 21171</td>\n",
|
||
|
" <td>7.2500</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>S</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
||
|
" <td>female</td>\n",
|
||
|
" <td>38.0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>PC 17599</td>\n",
|
||
|
" <td>71.2833</td>\n",
|
||
|
" <td>C85</td>\n",
|
||
|
" <td>C</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>Heikkinen, Miss. Laina</td>\n",
|
||
|
" <td>female</td>\n",
|
||
|
" <td>26.0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>STON/O2. 3101282</td>\n",
|
||
|
" <td>7.9250</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>S</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
||
|
" <td>female</td>\n",
|
||
|
" <td>35.0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>113803</td>\n",
|
||
|
" <td>53.1000</td>\n",
|
||
|
" <td>C123</td>\n",
|
||
|
" <td>S</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>5</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>Allen, Mr. William Henry</td>\n",
|
||
|
" <td>male</td>\n",
|
||
|
" <td>35.0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>373450</td>\n",
|
||
|
" <td>8.0500</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>S</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" PassengerId Survived Pclass \\\n",
|
||
|
"0 1 0 3 \n",
|
||
|
"1 2 1 1 \n",
|
||
|
"2 3 1 3 \n",
|
||
|
"3 4 1 1 \n",
|
||
|
"4 5 0 3 \n",
|
||
|
"\n",
|
||
|
" Name Sex Age SibSp \\\n",
|
||
|
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
|
||
|
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
|
||
|
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
|
||
|
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
|
||
|
"4 Allen, Mr. William Henry male 35.0 0 \n",
|
||
|
"\n",
|
||
|
" Parch Ticket Fare Cabin Embarked \n",
|
||
|
"0 0 A/5 21171 7.2500 NaN S \n",
|
||
|
"1 0 PC 17599 71.2833 C85 C \n",
|
||
|
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
|
||
|
"3 0 113803 53.1000 C123 S \n",
|
||
|
"4 0 373450 8.0500 NaN S "
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 29,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.head()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 31,
|
||
|
"id": "236e74ea",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df = df.drop([\"PassengerId\", \"Name\", \"SibSp\", \"Parch\", \"Ticket\", \"Cabin\", \"Embarked\"], axis=\"columns\")#删掉这些不会影响是否能够生存"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 32,
|
||
|
"id": "a348cb79",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>Survived</th>\n",
|
||
|
" <th>Pclass</th>\n",
|
||
|
" <th>Sex</th>\n",
|
||
|
" <th>Age</th>\n",
|
||
|
" <th>Fare</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>male</td>\n",
|
||
|
" <td>22.0</td>\n",
|
||
|
" <td>7.2500</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>female</td>\n",
|
||
|
" <td>38.0</td>\n",
|
||
|
" <td>71.2833</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>female</td>\n",
|
||
|
" <td>26.0</td>\n",
|
||
|
" <td>7.9250</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>female</td>\n",
|
||
|
" <td>35.0</td>\n",
|
||
|
" <td>53.1000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>male</td>\n",
|
||
|
" <td>35.0</td>\n",
|
||
|
" <td>8.0500</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" Survived Pclass Sex Age Fare\n",
|
||
|
"0 0 3 male 22.0 7.2500\n",
|
||
|
"1 1 1 female 38.0 71.2833\n",
|
||
|
"2 1 3 female 26.0 7.9250\n",
|
||
|
"3 1 1 female 35.0 53.1000\n",
|
||
|
"4 0 3 male 35.0 8.0500"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 32,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.head()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 33,
|
||
|
"id": "1c83ecbb",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn.preprocessing import LabelEncoder"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 34,
|
||
|
"id": "31934ad8",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"le = LabelEncoder()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 35,
|
||
|
"id": "a8c11cb0",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df.Sex = le.fit_transform(df.Sex)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 36,
|
||
|
"id": "0ee729dd",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>Survived</th>\n",
|
||
|
" <th>Pclass</th>\n",
|
||
|
" <th>Sex</th>\n",
|
||
|
" <th>Age</th>\n",
|
||
|
" <th>Fare</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>22.0</td>\n",
|
||
|
" <td>7.2500</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>38.0</td>\n",
|
||
|
" <td>71.2833</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>26.0</td>\n",
|
||
|
" <td>7.9250</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>35.0</td>\n",
|
||
|
" <td>53.1000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>35.0</td>\n",
|
||
|
" <td>8.0500</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" Survived Pclass Sex Age Fare\n",
|
||
|
"0 0 3 1 22.0 7.2500\n",
|
||
|
"1 1 1 0 38.0 71.2833\n",
|
||
|
"2 1 3 0 26.0 7.9250\n",
|
||
|
"3 1 1 0 35.0 53.1000\n",
|
||
|
"4 0 3 1 35.0 8.0500"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 36,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.head()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 37,
|
||
|
"id": "6e78bdfc",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"survived = df.Survived"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 39,
|
||
|
"id": "06950393",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"features = df.drop(\"Survived\", axis=\"columns\")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "3703250b",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"数据中有缺失,需要处理。"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 56,
|
||
|
"id": "8df08717",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"Pclass 0\n",
|
||
|
"Sex 0\n",
|
||
|
"Age 177\n",
|
||
|
"Fare 0\n",
|
||
|
"dtype: int64"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 56,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"features.isnull().sum()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 59,
|
||
|
"id": "6bbac05e",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"features.Age = features.Age.fillna(features.Age.median())"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 61,
|
||
|
"id": "74fdd7ec",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"Pclass 0\n",
|
||
|
"Sex 0\n",
|
||
|
"Age 0\n",
|
||
|
"Fare 0\n",
|
||
|
"dtype: int64"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 61,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"features.isnull().sum()#已经没有缺失数据"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 62,
|
||
|
"id": "ce016346",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn.model_selection import train_test_split"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 63,
|
||
|
"id": "1cd7238f",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"x_train, x_test, y_train, y_test = train_test_split(features, survived, test_size=0.2, random_state=10)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 64,
|
||
|
"id": "fa0b077c",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"(891, 712, 179)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 64,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"len(features), len(x_train), len(x_test)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 65,
|
||
|
"id": "50eba766",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn import tree"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 66,
|
||
|
"id": "eb37b412",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"dct = tree.DecisionTreeClassifier()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 67,
|
||
|
"id": "a3e977e4",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"DecisionTreeClassifier()"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 67,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"dct.fit(x_train, y_train)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 68,
|
||
|
"id": "50f6560c",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"y_pred = dct.predict(x_test)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 71,
|
||
|
"id": "30e62926",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"0.8268156424581006"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 71,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"(y_pred == y_test).sum()/len(y_test)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 73,
|
||
|
"id": "72259345",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn.metrics import confusion_matrix"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 74,
|
||
|
"id": "e46f73ce",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"cm = confusion_matrix(y_test, y_pred)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 75,
|
||
|
"id": "e4aeef28",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"array([[102, 15],\n",
|
||
|
" [ 16, 46]], dtype=int64)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 75,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"cm"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "b14a69f2",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3 (ipykernel)",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.7.9"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 5
|
||
|
}
|