You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

476 lines
12 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Category</th>\n",
" <th>Message</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ham</td>\n",
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ham</td>\n",
" <td>Ok lar... Joking wif u oni...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>spam</td>\n",
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ham</td>\n",
" <td>U dun say so early hor... U c already then say...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ham</td>\n",
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Category Message\n",
"0 ham Go until jurong point, crazy.. Available only ...\n",
"1 ham Ok lar... Joking wif u oni...\n",
"2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
"3 ham U dun say so early hor... U c already then say...\n",
"4 ham Nah I don't think he goes to usf, he lives aro..."
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"spam.csv\")\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe thead tr:last-of-type th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"4\" halign=\"left\">Message</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>unique</th>\n",
" <th>top</th>\n",
" <th>freq</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Category</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>ham</th>\n",
" <td>4825</td>\n",
" <td>4516</td>\n",
" <td>Sorry, I'll call later</td>\n",
" <td>30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>spam</th>\n",
" <td>747</td>\n",
" <td>641</td>\n",
" <td>Please call our customer service representativ...</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Message \\\n",
" count unique top \n",
"Category \n",
"ham 4825 4516 Sorry, I'll call later \n",
"spam 747 641 Please call our customer service representativ... \n",
"\n",
" \n",
" freq \n",
"Category \n",
"ham 30 \n",
"spam 4 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby('Category').describe()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Category</th>\n",
" <th>Message</th>\n",
" <th>spam</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ham</td>\n",
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ham</td>\n",
" <td>Ok lar... Joking wif u oni...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>spam</td>\n",
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ham</td>\n",
" <td>U dun say so early hor... U c already then say...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ham</td>\n",
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Category Message spam\n",
"0 ham Go until jurong point, crazy.. Available only ... 0\n",
"1 ham Ok lar... Joking wif u oni... 0\n",
"2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1\n",
"3 ham U dun say so early hor... U c already then say... 0\n",
"4 ham Nah I don't think he goes to usf, he lives aro... 0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"X_train, X_test, y_train, y_test = train_test_split(df.Message,df.spam)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0]], dtype=int64)"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"v = CountVectorizer()\n",
"X_train_count = v.fit_transform(X_train.values)\n",
"X_train_count.toarray()[:2]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"model = MultinomialNB()\n",
"model.fit(X_train_count,y_train)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1], dtype=int64)"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"emails = [\n",
" 'Hey mohan, can we get together to watch footbal game tomorrow?',\n",
" 'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'\n",
"]\n",
"emails_count = v.transform(emails)\n",
"model.predict(emails_count)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9827709978463748"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_count = v.transform(X_test)\n",
"model.score(X_test_count, y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Sklearn Pipeline**"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.pipeline import Pipeline\n",
"clf = Pipeline([\n",
" ('vectorizer', CountVectorizer()),\n",
" ('nb', MultinomialNB())\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(memory=None,\n",
" steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
" tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9827709978463748"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.score(X_test,y_test)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1], dtype=int64)"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.predict(emails)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}