{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CategoryMessage
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
\n", "
" ], "text/plain": [ " Category Message\n", "0 ham Go until jurong point, crazy.. Available only ...\n", "1 ham Ok lar... Joking wif u oni...\n", "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", "3 ham U dun say so early hor... U c already then say...\n", "4 ham Nah I don't think he goes to usf, he lives aro..." ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read the labelled messages (Category / Message columns) and preview the first rows.\n", "df = pd.read_csv(\"spam.csv\")\n", "df.head(n=5)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Message
countuniquetopfreq
Category
ham48254516Sorry, I'll call later30
spam747641Please call our customer service representativ...4
\n", "
" ], "text/plain": [ " Message \\\n", " count unique top \n", "Category \n", "ham 4825 4516 Sorry, I'll call later \n", "spam 747 641 Please call our customer service representativ... \n", "\n", " \n", " freq \n", "Category \n", "ham 30 \n", "spam 4 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Per-class summary: message count, distinct messages, and the most repeated message.\n", "df.groupby(by='Category').describe()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CategoryMessagespam
0hamGo until jurong point, crazy.. Available only ...0
1hamOk lar... Joking wif u oni...0
2spamFree entry in 2 a wkly comp to win FA Cup fina...1
3hamU dun say so early hor... U c already then say...0
4hamNah I don't think he goes to usf, he lives aro...0
\n", "
" ], "text/plain": [ " Category Message spam\n", "0 ham Go until jurong point, crazy.. Available only ... 0\n", "1 ham Ok lar... Joking wif u oni... 0\n", "2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1\n", "3 ham U dun say so early hor... U c already then say... 0\n", "4 ham Nah I don't think he goes to usf, he lives aro... 0" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Encode the label as 0/1 with a vectorized comparison instead of a per-row lambda.\n", "df['spam'] = (df['Category'] == 'spam').astype('int64')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "# Fixed seed so the split, and every score computed from it, is reproducible on re-run.\n", "X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, random_state=42)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0]], dtype=int64)" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "vectorizer = CountVectorizer()\n", "X_train_count = vectorizer.fit_transform(X_train.values)\n", "# Slice the sparse matrix first so only the two displayed rows are densified.\n", "X_train_count[:2].toarray()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.naive_bayes import MultinomialNB\n", "# Fit naive Bayes on the word-count features of the training messages.\n", "model = MultinomialNB()\n", "model.fit(X_train_count, y_train)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 1], dtype=int64)" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "emails = [\n", " 'Hey mohan, can we get together to watch footbal game tomorrow?',\n", " 'Upto 20% discount on parking, exclusive offer just for you. 
Dont miss this reward!'\n", "]\n", "# Reuse the vocabulary learned from the training data; do not refit on new text.\n", "emails_count = vectorizer.transform(emails)\n", "model.predict(emails_count)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9827709978463748" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean accuracy on the held-out messages.\n", "X_test_count = vectorizer.transform(X_test)\n", "model.score(X_test_count, y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Sklearn Pipeline**" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", "# Chain vectorizing and classification so raw text can be passed to fit/score/predict.\n", "clf = Pipeline([\n", " ('vectorizer', CountVectorizer()),\n", " ('nb', MultinomialNB())\n", "])" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(memory=None,\n", " steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", " dtype=, encoding='utf-8', input='content',\n", " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", " strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", " tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9827709978463748" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf.score(X_test, y_test)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 1], dtype=int64)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf.predict(emails)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", 
"language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }