{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Importing the Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn import preprocessing\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from custom_class_data_processing import data_processing_classification #import the custom class for data_processsing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Importing the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>workclass</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education</th>\n",
       "      <th>education-num</th>\n",
       "      <th>marital-status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>sex</th>\n",
       "      <th>capital-gain</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>hours-per-week</th>\n",
       "      <th>native-country</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>19</td>\n",
       "      <td>Private</td>\n",
       "      <td>168294</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Craft-repair</td>\n",
       "      <td>Own-child</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>47</td>\n",
       "      <td>Self-emp-inc</td>\n",
       "      <td>109832</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>60</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>35</td>\n",
       "      <td>Private</td>\n",
       "      <td>56352</td>\n",
       "      <td>Assoc-voc</td>\n",
       "      <td>11</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Other-service</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>Puerto-Rico</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>41</td>\n",
       "      <td>Private</td>\n",
       "      <td>147372</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>48</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>29</td>\n",
       "      <td>Private</td>\n",
       "      <td>105598</td>\n",
       "      <td>Some-college</td>\n",
       "      <td>10</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Tech-support</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>58</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   age     workclass  fnlwgt     education  education-num      marital-status  \\\n",
       "0   19       Private  168294       HS-grad              9       Never-married   \n",
       "1   47  Self-emp-inc  109832       HS-grad              9            Divorced   \n",
       "2   35       Private   56352     Assoc-voc             11  Married-civ-spouse   \n",
       "3   41       Private  147372       HS-grad              9  Married-civ-spouse   \n",
       "4   29       Private  105598  Some-college             10            Divorced   \n",
       "\n",
       "        occupation   relationship   race   sex  capital-gain  capital-loss  \\\n",
       "0     Craft-repair      Own-child  White  Male             0             0   \n",
       "1  Exec-managerial  Not-in-family  White  Male             0             0   \n",
       "2    Other-service        Husband  White  Male             0             0   \n",
       "3     Adm-clerical        Husband  White  Male             0             0   \n",
       "4     Tech-support  Not-in-family  White  Male             0             0   \n",
       "\n",
       "   hours-per-week native-country income  \n",
       "0              40  United-States  <=50K  \n",
       "1              60  United-States  <=50K  \n",
       "2              40    Puerto-Rico  <=50K  \n",
       "3              48  United-States  <=50K  \n",
       "4              58  United-States  <=50K  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Importing the data \n",
    "data = pd.read_csv(\"adult_2500.csv\")\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data preprocessing - Filter the numerical columns and normalize the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Splitting the data in train and test sets\n",
    "\n",
    "X = data.loc[:, data.columns != 'income']  \n",
    "y = data[[\"income\"]]\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(Empty DataFrame\n",
       " Columns: []\n",
       " Index: [],\n",
       " [])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_train = pd.concat([X_train, y_train], axis=1)\n",
    "data_test =  pd.concat([X_test, y_test], axis=1)\n",
    "\n",
    "\n",
    "class_instance = data_processing_classification(\"income\") #creating an instance of the data preprocessing class defined above\n",
    "class_instance.pre_process(data_train, train=True) # learning the normaliation parameters as per X_train and traget enconding as per y_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'target_column': 'income',\n",
       " 'count_classes': 2,\n",
       " 'std_scale': StandardScaler(),\n",
       " 'target_encoded_categorical':     income  Target_Class\n",
       " 308  <=50K             0\n",
       " 59    >50K             1,\n",
       " 'count_features': 6,\n",
       " 'feature_names': Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',\n",
       "        'hours-per-week'],\n",
       "       dtype='object')}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#check the attributes of the instance\n",
    "class_instance.__dict__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocessing the train and the test data\n",
    "data_train_preprocessed, y_train = class_instance.pre_process(data_train,norm=True,class_label=True,train=False)\n",
    "data_test_preprocessed, y_test = class_instance.pre_process(data_test,norm=True,class_label=True,train=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train the Neural Nets Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\mpatt\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\neural_network\\_multilayer_perceptron.py:617: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "MLPClassifier(hidden_layer_sizes=(30, 30, 30))"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mlp = MLPClassifier(hidden_layer_sizes=(30,30,30))\n",
    "mlp.fit(data_train_preprocessed, y_train)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "        <50K       0.84      0.95      0.89       629\n",
      "        >50K       0.71      0.40      0.51       196\n",
      "\n",
      "    accuracy                           0.82       825\n",
      "   macro avg       0.77      0.68      0.70       825\n",
      "weighted avg       0.80      0.82      0.80       825\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "target_names = ['<50K', '>50K']\n",
    "y_pred = mlp.predict(data_test_preprocessed)\n",
    "print(classification_report(y_test, y_pred, target_names=target_names))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pickle the model and the prepocessing objects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "#Pickle the model\n",
    "filename = 'NeuralNets.pkl'\n",
    "pickle.dump(mlp, open(filename, 'wb'))\n",
    "\n",
    "\n",
    "#Pickle the preprocessing object\n",
    "filename = 'data_preprocess.pkl'\n",
    "pickle.dump(class_instance, open(filename, 'wb'))\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}