{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "9c8d8367-9587-4e87-ad42-2debf858dfcb", "metadata": {}, "outputs": [], "source": [ "# All dependencies are imported up front so a fresh kernel can run the notebook top to bottom.\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.ensemble import BaggingClassifier, RandomForestClassifier\n", "from sklearn.model_selection import cross_val_score, train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.tree import DecisionTreeClassifier" ] }, { "cell_type": "code", "execution_count": 2, "id": "bece23f0-ea8b-450b-b82a-63bddc3c1de7", "metadata": {}, "outputs": [], "source": [ "# Load the dataset; the path is relative to the notebook's working directory.\n", "df = pd.read_csv(\"diabetes.csv\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "554e5c95-083e-45fc-9438-b2121ec20b4f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
" ], "text/plain": [ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n", "0 6 148 72 35 0 33.6 \n", "1 1 85 66 29 0 26.6 \n", "2 8 183 64 0 0 23.3 \n", "3 1 89 66 23 94 28.1 \n", "4 0 137 40 35 168 43.1 \n", "\n", " DiabetesPedigreeFunction Age Outcome \n", "0 0.627 50 1 \n", "1 0.351 31 0 \n", "2 0.672 32 1 \n", "3 0.167 21 0 \n", "4 2.288 33 1 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Preview the first rows of the raw table.\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "735d8795-1dac-4623-be2d-7fb2a2f5049e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pregnancies 0\n", "Glucose 0\n", "BloodPressure 0\n", "SkinThickness 0\n", "Insulin 0\n", "BMI 0\n", "DiabetesPedigreeFunction 0\n", "Age 0\n", "Outcome 0\n", "dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Count NaN entries per column.\n", "# NOTE(review): the preview of df shows 0 in SkinThickness and Insulin; those zeros may be\n", "# placeholders for missing measurements, which a NaN count would not catch - confirm.\n", "df.isna().sum()" ] }, { "cell_type": "code", "execution_count": 5, "id": "7a10a77a-d90f-4b0b-a951-d1c7c46202c7", "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler" ] }, { "cell_type": "code", "execution_count": 7, "id": "301f9c12-aeae-45b1-8689-18cc6e00718d", "metadata": {}, "outputs": [], "source": [ "# The Outcome column is the label; every remaining column is a feature.\n", "y = df[\"Outcome\"]\n", "x = df.drop(columns=\"Outcome\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "fa9a0f52-5647-429c-8b76-df87a3243519", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
" ], "text/plain": [ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n", "0 6 148 72 35 0 33.6 \n", "1 1 85 66 29 0 26.6 \n", "2 8 183 64 0 0 23.3 \n", "3 1 89 66 23 94 28.1 \n", "4 0 137 40 35 168 43.1 \n", "\n", " DiabetesPedigreeFunction Age \n", "0 0.627 50 \n", "1 0.351 31 \n", "2 0.672 32 \n", "3 0.167 21 \n", "4 2.288 33 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Feature columns only (Outcome has been removed).\n", "x.head()" ] }, { "cell_type": "code", "execution_count": 10, "id": "c7682595-b4fd-4fbc-8fca-e4a9300a4378", "metadata": {}, "outputs": [], "source": [ "# Scaler used to standardize the feature columns.\n", "scaler = StandardScaler()" ] }, { "cell_type": "code", "execution_count": 11, "id": "f4af8802-d94e-4fa7-a993-bd33e8fd56a5", "metadata": {}, "outputs": [], "source": [ "# Fit the scaler and transform the features in one call.\n", "# NOTE(review): the scaler is fitted on every row (before any train/test split), and the\n", "# model cells further down are fed the unscaled x, not x_scalered.\n", "x_scalered = scaler.fit_transform(x)" ] }, { "cell_type": "code", "execution_count": 13, "id": "84572647-fa48-4807-b58f-eef4fb7650bc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0.63994726, 0.84832379, 0.14964075, 0.90726993, -0.69289057,\n", " 0.20401277, 0.46849198, 1.4259954 ],\n", " [-0.84488505, -1.12339636, -0.16054575, 0.53090156, -0.69289057,\n", " -0.68442195, -0.36506078, -0.19067191],\n", " [ 1.23388019, 1.94372388, -0.26394125, -1.28821221, -0.69289057,\n", " -1.10325546, 0.60439732, -0.10558415]])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First three standardized rows.\n", "x_scalered[:3]" ] }, { "cell_type": "code", "execution_count": 16, "id": "eea39bb8-93dd-4bcd-b509-10cfff7b5fbb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(500, 268)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Row counts for label 0 and label 1; the data is reasonably balanced.\n", "tuple(len(y[y == label]) for label in (0, 1))" ] }, { "cell_type": "code", "execution_count": 19, "id": "725d9479-9a33-4d8b-93b5-256ebef8c078", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.536" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Ratio of label-1 rows to label-0 rows in the full dataset.\n", "n_negative, n_positive = (len(y[y == label]) for label in (0, 1))\n", "n_positive / n_negative" ] }, { 
"cell_type": "code", "execution_count": 30, "id": "9983a9ff-6aff-4f2e-8d3a-4dfe52c0a580", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 31, "id": "2b049926-9155-40c9-803e-a9f8a75da355", "metadata": {}, "outputs": [], "source": [ "# stratify=y keeps the class ratio the same in the train and test splits;\n", "# random_state pins the shuffle so the split, and every score computed from it, is reproducible.\n", "x_train, x_test, y_train, y_test = train_test_split(\n", "    x, y, stratify=y, test_size=0.3, random_state=0\n", ")" ] }, { "cell_type": "code", "execution_count": 32, "id": "d36e5561-5ded-473d-9161-59ee4e2f7755", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5342857142857143" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sanity check: label-1 to label-0 ratio inside the training split.\n", "len(y_train[y_train == 1]) / len(y_train[y_train == 0])" ] }, { "cell_type": "code", "execution_count": 38, "id": "e8c4fdb1-2730-46c5-99f8-d2cb4c6d1d19", "metadata": {}, "outputs": [], "source": [ "from sklearn.tree import DecisionTreeClassifier" ] }, { "cell_type": "code", "execution_count": 39, "id": "f4cdd4a2-b7db-4e0c-bb7e-436796b38f77", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import cross_val_score" ] }, { "cell_type": "code", "execution_count": 41, "id": "62495661-ca1e-43d4-9da7-2591acfcf639", "metadata": {}, "outputs": [], "source": [ "# Baseline: 5-fold cross-validation scores of a single decision tree.\n", "# The tree is seeded so repeated runs give the same scores.\n", "cvs = cross_val_score(DecisionTreeClassifier(random_state=0), x, y, cv=5)" ] }, { "cell_type": "code", "execution_count": 42, "id": "3d10246e-3e29-4037-a166-10b365381b29", "metadata": {}, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": 43, "id": "0be2cea0-bfca-44dd-85d3-85de1e14355e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.7110432051608522" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean score across the five folds.\n", "np.mean(cvs)" ] }, { "cell_type": "code", "execution_count": 44, "id": "da9f0cde-1b1f-45b2-9d18-50de178bdd65", "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import BaggingClassifier" ] }, { "cell_type": "code", 
"execution_count": 45, "id": "0a7f52d2-f648-4062-946b-f178823bb29d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
BaggingClassifier(estimator=DecisionTreeClassifier(), max_samples=0.8,\n",
       "                  n_estimators=100, oob_score=True, random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "BaggingClassifier(estimator=DecisionTreeClassifier(), max_samples=0.8,\n", " n_estimators=100, oob_score=True, random_state=0)" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bagging ensemble of 100 decision trees; max_samples=0.8 draws 80% of the training rows\n", "# for each tree, and oob_score=True additionally computes an out-of-bag score during fit.\n", "bagg = BaggingClassifier(\n", "    estimator=DecisionTreeClassifier(),\n", "    n_estimators=100,\n", "    max_samples=0.8,\n", "    oob_score=True,\n", "    random_state=0,\n", ")\n", "bagg.fit(x_train, y_train)" ] }, { "cell_type": "code", "execution_count": 46, "id": "0e94db07-919f-4911-8c37-0516649628f8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.7392923649906891" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Out-of-bag score computed on the training split during fit.\n", "bagg.oob_score_" ] }, { "cell_type": "code", "execution_count": 47, "id": "6ed41070-4666-4d84-b5c7-3be05809aec9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.7662337662337663" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Score on the held-out test split.\n", "bagg.score(x_test, y_test)" ] }, { "cell_type": "code", "execution_count": 49, "id": "75149b99-951c-4566-b039-ee7142886a97", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.7578728461081402" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# cross_val_score fits clones of the estimator, so the bagging configuration is reused here\n", "# without re-declaring it, and the already fitted bagg stays usable by the cells above.\n", "np.mean(cross_val_score(bagg, x, y, cv=5))" ] }, { "cell_type": "code", "execution_count": 50, "id": "fdec9f7e-4b83-462f-ae41-0aeb1a20da7a", "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier" ] }, { "cell_type": "code", "execution_count": 55, "id": "1ca4c92b-d7c1-4c5e-b2fa-f5331ca4ccd5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.7657074951192598" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"# A random forest is bagging applied to decision trees, with a random subset of the\n", "# features considered at each split; random_state makes the score repeatable.\n", "cross_val_score(RandomForestClassifier(random_state=0), x, y, cv=5).mean()" ] }, { "cell_type": "code", "execution_count": null, "id": "08f53b7a-a348-4e12-83e1-3d04d43d7907", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 5 }