{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Importing the Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import tensorflow as tf\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense\n",
    "from keras.utils import to_categorical \n",
    "from sklearn import preprocessing\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from custom_class_data_processing import data_processing_classification #import the custom class for data_processsing\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Importing the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>workclass</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education</th>\n",
       "      <th>education-num</th>\n",
       "      <th>marital-status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>sex</th>\n",
       "      <th>capital-gain</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>hours-per-week</th>\n",
       "      <th>native-country</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>19</td>\n",
       "      <td>Private</td>\n",
       "      <td>168294</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Craft-repair</td>\n",
       "      <td>Own-child</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>47</td>\n",
       "      <td>Self-emp-inc</td>\n",
       "      <td>109832</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>60</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>35</td>\n",
       "      <td>Private</td>\n",
       "      <td>56352</td>\n",
       "      <td>Assoc-voc</td>\n",
       "      <td>11</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Other-service</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>Puerto-Rico</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>41</td>\n",
       "      <td>Private</td>\n",
       "      <td>147372</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>48</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>29</td>\n",
       "      <td>Private</td>\n",
       "      <td>105598</td>\n",
       "      <td>Some-college</td>\n",
       "      <td>10</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Tech-support</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>58</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   age     workclass  fnlwgt     education  education-num      marital-status  \\\n",
       "0   19       Private  168294       HS-grad              9       Never-married   \n",
       "1   47  Self-emp-inc  109832       HS-grad              9            Divorced   \n",
       "2   35       Private   56352     Assoc-voc             11  Married-civ-spouse   \n",
       "3   41       Private  147372       HS-grad              9  Married-civ-spouse   \n",
       "4   29       Private  105598  Some-college             10            Divorced   \n",
       "\n",
       "        occupation   relationship   race   sex  capital-gain  capital-loss  \\\n",
       "0     Craft-repair      Own-child  White  Male             0             0   \n",
       "1  Exec-managerial  Not-in-family  White  Male             0             0   \n",
       "2    Other-service        Husband  White  Male             0             0   \n",
       "3     Adm-clerical        Husband  White  Male             0             0   \n",
       "4     Tech-support  Not-in-family  White  Male             0             0   \n",
       "\n",
       "   hours-per-week native-country income  \n",
       "0              40  United-States  <=50K  \n",
       "1              60  United-States  <=50K  \n",
       "2              40    Puerto-Rico  <=50K  \n",
       "3              48  United-States  <=50K  \n",
       "4              58  United-States  <=50K  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Importing the data \n",
    "data = pd.read_csv(\"adult_2500.csv\")\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data preprocessing - Filter the numerical columns and normalize the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Splitting the data in train and test sets\n",
    "\n",
    "X = data.loc[:, data.columns != 'income']  \n",
    "y = data[[\"income\"]]\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(Empty DataFrame\n",
       " Columns: []\n",
       " Index: [],\n",
       " [])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_train = pd.concat([X_train, y_train], axis=1)\n",
    "data_test =  pd.concat([X_test, y_test], axis=1)\n",
    "\n",
    "class_instance = data_processing_classification(\"income\") #creating an instance of the data preprocessing class defined above\n",
    "class_instance.pre_process(data_train, train=True) # learning the normaliation parameters as per X_train and traget enconding as per y_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'target_column': 'income',\n",
       " 'count_classes': 2,\n",
       " 'std_scale': StandardScaler(),\n",
       " 'target_encoded_categorical':     income  Target_Class\n",
       " 308  <=50K             0\n",
       " 59    >50K             1,\n",
       " 'count_features': 6,\n",
       " 'feature_names': Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',\n",
       "        'hours-per-week'],\n",
       "       dtype='object')}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#check the attributes of the instance\n",
    "class_instance.__dict__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocessing the train and the test data\n",
    "data_train_preprocessed, y_train = class_instance.pre_process(data_train,norm=True,class_label=True,train=False)\n",
    "data_test_preprocessed, y_test = class_instance.pre_process(data_test,norm=True,class_label=True,train=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train the Keras Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"model_1\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input_2 (InputLayer)         [(None, 6)]               0         \n",
      "_________________________________________________________________\n",
      "dense_3 (Dense)              (None, 64)                448       \n",
      "_________________________________________________________________\n",
      "dense_4 (Dense)              (None, 32)                2080      \n",
      "_________________________________________________________________\n",
      "dense_5 (Dense)              (None, 2)                 66        \n",
      "=================================================================\n",
      "Total params: 2,594\n",
      "Trainable params: 2,594\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "inputs = tf.keras.Input(shape=(6))\n",
    "x1 = tf.keras.layers.Dense(64, activation=tf.nn.relu)(inputs)\n",
    "x2 = tf.keras.layers.Dense(32, activation=tf.nn.relu)(x1)\n",
    "outputs = tf.keras.layers.Dense(2, activation=tf.nn.softmax)(x2)\n",
    "model = tf.keras.Model(inputs=inputs, outputs=outputs)\n",
    "\n",
    "model.summary()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/5\n",
      "5/5 [==============================] - 0s 2ms/step - loss: 0.6978 - accuracy: 0.4143\n",
      "Epoch 2/5\n",
      "5/5 [==============================] - 0s 2ms/step - loss: 0.6947 - accuracy: 0.5200\n",
      "Epoch 3/5\n",
      "5/5 [==============================] - 0s 2ms/step - loss: 0.6941 - accuracy: 0.6113\n",
      "Epoch 4/5\n",
      "5/5 [==============================] - 0s 1ms/step - loss: 0.6940 - accuracy: 0.5672\n",
      "Epoch 5/5\n",
      "5/5 [==============================] - 0s 2ms/step - loss: 0.6938 - accuracy: 0.5224\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.keras.callbacks.History at 0x238dc5dc508>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Compile the model\n",
    "model.compile(optimizer='adam', \n",
    "              loss='binary_crossentropy', \n",
    "              metrics=['accuracy'])\n",
    "\n",
    "# build the model\n",
    "model.fit(data_train_preprocessed, np.asarray(y_train), epochs=5, batch_size= 400)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_prediction = model.predict(data_test_preprocessed)\n",
    "y_pred = []\n",
    "\n",
    "for cnt in range(len(y_prediction)):\n",
    "    if  y_prediction[cnt,1] < 0.5:\n",
    "        y_pred.append(0)\n",
    "    else:\n",
    "        y_pred.append(1)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "        <50K       0.75      0.48      0.59       629\n",
      "        >50K       0.23      0.49      0.31       196\n",
      "\n",
      "    accuracy                           0.49       825\n",
      "   macro avg       0.49      0.49      0.45       825\n",
      "weighted avg       0.63      0.49      0.52       825\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "target_names = ['<50K', '>50K']\n",
    "\n",
    "print(classification_report(y_test, y_pred, target_names=target_names))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the model and pickle the prepocessing objects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "model.save(\"DL_model.h5\")\n",
    "\n",
    "#Pickle the preprocessing object\n",
    "filename = 'data_preprocess.pkl'\n",
    "pickle.dump(class_instance, open(filename, 'wb'))\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}