You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

688 lines
23 KiB
Plaintext

6 months ago
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "9c8d8367-9587-4e87-ad42-2debf858dfcb",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "bece23f0-ea8b-450b-b82a-63bddc3c1de7",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"diabetes.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "554e5c95-083e-45fc-9438-b2121ec20b4f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Pregnancies</th>\n",
" <th>Glucose</th>\n",
" <th>BloodPressure</th>\n",
" <th>SkinThickness</th>\n",
" <th>Insulin</th>\n",
" <th>BMI</th>\n",
" <th>DiabetesPedigreeFunction</th>\n",
" <th>Age</th>\n",
" <th>Outcome</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6</td>\n",
" <td>148</td>\n",
" <td>72</td>\n",
" <td>35</td>\n",
" <td>0</td>\n",
" <td>33.6</td>\n",
" <td>0.627</td>\n",
" <td>50</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>85</td>\n",
" <td>66</td>\n",
" <td>29</td>\n",
" <td>0</td>\n",
" <td>26.6</td>\n",
" <td>0.351</td>\n",
" <td>31</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8</td>\n",
" <td>183</td>\n",
" <td>64</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>23.3</td>\n",
" <td>0.672</td>\n",
" <td>32</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>89</td>\n",
" <td>66</td>\n",
" <td>23</td>\n",
" <td>94</td>\n",
" <td>28.1</td>\n",
" <td>0.167</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>137</td>\n",
" <td>40</td>\n",
" <td>35</td>\n",
" <td>168</td>\n",
" <td>43.1</td>\n",
" <td>2.288</td>\n",
" <td>33</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"0 0.627 50 1 \n",
"1 0.351 31 0 \n",
"2 0.672 32 1 \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "735d8795-1dac-4623-be2d-7fb2a2f5049e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pregnancies 0\n",
"Glucose 0\n",
"BloodPressure 0\n",
"SkinThickness 0\n",
"Insulin 0\n",
"BMI 0\n",
"DiabetesPedigreeFunction 0\n",
"Age 0\n",
"Outcome 0\n",
"dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "7a10a77a-d90f-4b0b-a951-d1c7c46202c7",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "301f9c12-aeae-45b1-8689-18cc6e00718d",
"metadata": {},
"outputs": [],
"source": [
"y = df.Outcome\n",
"x = df.drop(\"Outcome\", axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "fa9a0f52-5647-429c-8b76-df87a3243519",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Pregnancies</th>\n",
" <th>Glucose</th>\n",
" <th>BloodPressure</th>\n",
" <th>SkinThickness</th>\n",
" <th>Insulin</th>\n",
" <th>BMI</th>\n",
" <th>DiabetesPedigreeFunction</th>\n",
" <th>Age</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6</td>\n",
" <td>148</td>\n",
" <td>72</td>\n",
" <td>35</td>\n",
" <td>0</td>\n",
" <td>33.6</td>\n",
" <td>0.627</td>\n",
" <td>50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>85</td>\n",
" <td>66</td>\n",
" <td>29</td>\n",
" <td>0</td>\n",
" <td>26.6</td>\n",
" <td>0.351</td>\n",
" <td>31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8</td>\n",
" <td>183</td>\n",
" <td>64</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>23.3</td>\n",
" <td>0.672</td>\n",
" <td>32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>89</td>\n",
" <td>66</td>\n",
" <td>23</td>\n",
" <td>94</td>\n",
" <td>28.1</td>\n",
" <td>0.167</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>137</td>\n",
" <td>40</td>\n",
" <td>35</td>\n",
" <td>168</td>\n",
" <td>43.1</td>\n",
" <td>2.288</td>\n",
" <td>33</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
"\n",
" DiabetesPedigreeFunction Age \n",
"0 0.627 50 \n",
"1 0.351 31 \n",
"2 0.672 32 \n",
"3 0.167 21 \n",
"4 2.288 33 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c7682595-b4fd-4fbc-8fca-e4a9300a4378",
"metadata": {},
"outputs": [],
"source": [
"scaler = StandardScaler()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f4af8802-d94e-4fa7-a993-bd33e8fd56a5",
"metadata": {},
"outputs": [],
"source": [
"x_scalered = scaler.fit_transform(x)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "84572647-fa48-4807-b58f-eef4fb7650bc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.63994726, 0.84832379, 0.14964075, 0.90726993, -0.69289057,\n",
" 0.20401277, 0.46849198, 1.4259954 ],\n",
" [-0.84488505, -1.12339636, -0.16054575, 0.53090156, -0.69289057,\n",
" -0.68442195, -0.36506078, -0.19067191],\n",
" [ 1.23388019, 1.94372388, -0.26394125, -1.28821221, -0.69289057,\n",
" -1.10325546, 0.60439732, -0.10558415]])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x_scalered[:3]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "eea39bb8-93dd-4bcd-b509-10cfff7b5fbb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(500, 268)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(y[y==0]), len(y[y==1])#数据还算平衡"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "725d9479-9a33-4d8b-93b5-256ebef8c078",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.536"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(y[y==1])/len(y[y==0])"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "9983a9ff-6aff-4f2e-8d3a-4dfe52c0a580",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "2b049926-9155-40c9-803e-a9f8a75da355",
"metadata": {},
"outputs": [],
"source": [
"x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.3)#stratify使拆分后比例一样"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "d36e5561-5ded-473d-9161-59ee4e2f7755",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5342857142857143"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(y_train[y_train==1])/len(y_train[y_train==0])"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "e8c4fdb1-2730-46c5-99f8-d2cb4c6d1d19",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.tree import DecisionTreeClassifier"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "f4cdd4a2-b7db-4e0c-bb7e-436796b38f77",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import cross_val_score"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "62495661-ca1e-43d4-9da7-2591acfcf639",
"metadata": {},
"outputs": [],
"source": [
"cvs = cross_val_score(DecisionTreeClassifier(), x, y, cv=5)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "3d10246e-3e29-4037-a166-10b365381b29",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "0be2cea0-bfca-44dd-85d3-85de1e14355e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7110432051608522"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.mean(cvs)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "da9f0cde-1b1f-45b2-9d18-50de178bdd65",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import BaggingClassifier"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "0a7f52d2-f648-4062-946b-f178823bb29d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-r
" n_estimators=100, oob_score=True, random_state=0)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" ><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">BaggingClassifier</label><div class=\"sk-toggleable__content\"><pre>BaggingClassifier(estimator=DecisionTreeClassifier(), max_samples=0.8,\n",
" n_estimators=100, oob_score=True, random_state=0)</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">estimator: DecisionTreeClassifier</label><div class=\"sk-toggleable__content\"><pre>DecisionTreeClassifier()</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">DecisionTreeClassifier</label><div class=\"sk-toggleable__content\"><pre>DecisionTreeClassifier()</pre></div></div></div></div></div></div></div></div></div></div>"
],
"text/plain": [
"BaggingClassifier(estimator=DecisionTreeClassifier(), max_samples=0.8,\n",
" n_estimators=100, oob_score=True, random_state=0)"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bagg = BaggingClassifier(estimator=DecisionTreeClassifier(),\n",
" n_estimators=100,\n",
" max_samples=0.8,\n",
" oob_score=True,\n",
" random_state=0\n",
" )\n",
"bagg.fit(x_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "0e94db07-919f-4911-8c37-0516649628f8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7392923649906891"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bagg.oob_score_"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "6ed41070-4666-4d84-b5c7-3be05809aec9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7662337662337663"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bagg.score(x_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "75149b99-951c-4566-b039-ee7142886a97",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7578728461081402"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bagg = BaggingClassifier(estimator=DecisionTreeClassifier(),\n",
" n_estimators=100,\n",
" max_samples=0.8,\n",
" oob_score=True,\n",
" random_state=0\n",
" )\n",
"np.mean(cross_val_score(bagg, x, y, cv=5))"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "fdec9f7e-4b83-462f-ae41-0aeb1a20da7a",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "1ca4c92b-d7c1-4c5e-b2fa-f5331ca4ccd5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7657074951192598"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cross_val_score(RandomForestClassifier(), x, y, cv=5).mean()#随机森林就是bagging"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08f53b7a-a348-4e12-83e1-3d04d43d7907",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}