You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
476 lines
12 KiB
Plaintext
476 lines
12 KiB
Plaintext
6 months ago
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import pandas as pd"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>Category</th>\n",
|
||
|
" <th>Message</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>ham</td>\n",
|
||
|
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>ham</td>\n",
|
||
|
" <td>Ok lar... Joking wif u oni...</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>spam</td>\n",
|
||
|
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>ham</td>\n",
|
||
|
" <td>U dun say so early hor... U c already then say...</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>ham</td>\n",
|
||
|
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" Category Message\n",
|
||
|
"0 ham Go until jurong point, crazy.. Available only ...\n",
|
||
|
"1 ham Ok lar... Joking wif u oni...\n",
|
||
|
"2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
|
||
|
"3 ham U dun say so early hor... U c already then say...\n",
|
||
|
"4 ham Nah I don't think he goes to usf, he lives aro..."
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 2,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df = pd.read_csv(\"spam.csv\")\n",
|
||
|
"df.head()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 3,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead tr th {\n",
|
||
|
" text-align: left;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead tr:last-of-type th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr>\n",
|
||
|
" <th></th>\n",
|
||
|
" <th colspan=\"4\" halign=\"left\">Message</th>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>count</th>\n",
|
||
|
" <th>unique</th>\n",
|
||
|
" <th>top</th>\n",
|
||
|
" <th>freq</th>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>Category</th>\n",
|
||
|
" <th></th>\n",
|
||
|
" <th></th>\n",
|
||
|
" <th></th>\n",
|
||
|
" <th></th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>ham</th>\n",
|
||
|
" <td>4825</td>\n",
|
||
|
" <td>4516</td>\n",
|
||
|
" <td>Sorry, I'll call later</td>\n",
|
||
|
" <td>30</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>spam</th>\n",
|
||
|
" <td>747</td>\n",
|
||
|
" <td>641</td>\n",
|
||
|
" <td>Please call our customer service representativ...</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" Message \\\n",
|
||
|
" count unique top \n",
|
||
|
"Category \n",
|
||
|
"ham 4825 4516 Sorry, I'll call later \n",
|
||
|
"spam 747 641 Please call our customer service representativ... \n",
|
||
|
"\n",
|
||
|
" \n",
|
||
|
" freq \n",
|
||
|
"Category \n",
|
||
|
"ham 30 \n",
|
||
|
"spam 4 "
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 3,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.groupby('Category').describe()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>Category</th>\n",
|
||
|
" <th>Message</th>\n",
|
||
|
" <th>spam</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>ham</td>\n",
|
||
|
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>ham</td>\n",
|
||
|
" <td>Ok lar... Joking wif u oni...</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>spam</td>\n",
|
||
|
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>ham</td>\n",
|
||
|
" <td>U dun say so early hor... U c already then say...</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>ham</td>\n",
|
||
|
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" Category Message spam\n",
|
||
|
"0 ham Go until jurong point, crazy.. Available only ... 0\n",
|
||
|
"1 ham Ok lar... Joking wif u oni... 0\n",
|
||
|
"2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1\n",
|
||
|
"3 ham U dun say so early hor... U c already then say... 0\n",
|
||
|
"4 ham Nah I don't think he goes to usf, he lives aro... 0"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 4,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)\n",
|
||
|
"df.head()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 7,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn.model_selection import train_test_split\n",
|
||
|
"X_train, X_test, y_train, y_test = train_test_split(df.Message,df.spam)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 31,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"array([[0, 0, 0, ..., 0, 0, 0],\n",
|
||
|
" [0, 0, 0, ..., 0, 0, 0]], dtype=int64)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 31,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||
|
"v = CountVectorizer()\n",
|
||
|
"X_train_count = v.fit_transform(X_train.values)\n",
|
||
|
"X_train_count.toarray()[:2]"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 23,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 23,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"from sklearn.naive_bayes import MultinomialNB\n",
|
||
|
"model = MultinomialNB()\n",
|
||
|
"model.fit(X_train_count,y_train)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 37,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"array([0, 1], dtype=int64)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 37,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"emails = [\n",
|
||
|
" 'Hey mohan, can we get together to watch footbal game tomorrow?',\n",
|
||
|
" 'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'\n",
|
||
|
"]\n",
|
||
|
"emails_count = v.transform(emails)\n",
|
||
|
"model.predict(emails_count)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 38,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"0.9827709978463748"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 38,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"X_test_count = v.transform(X_test)\n",
|
||
|
"model.score(X_test_count, y_test)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"**Sklearn Pipeline**"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 39,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn.pipeline import Pipeline\n",
|
||
|
"clf = Pipeline([\n",
|
||
|
" ('vectorizer', CountVectorizer()),\n",
|
||
|
" ('nb', MultinomialNB())\n",
|
||
|
"])"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 40,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"Pipeline(memory=None,\n",
|
||
|
" steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
|
||
|
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
|
||
|
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
|
||
|
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
|
||
|
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
|
||
|
" tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 40,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"clf.fit(X_train, y_train)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 41,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"0.9827709978463748"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 41,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"clf.score(X_test,y_test)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 42,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"array([0, 1], dtype=int64)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 42,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"clf.predict(emails)"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.7.3"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|