The dataset can be downloaded here -> https://www.kaggle.com/vetrirah/customer
Understand the Problem Statement & Import Packages and Datasets.
Perform EDA (Exploratory Data Analysis) to understand the datasets.
Remove duplicate rows from the train data, if present.
Fill/Impute missing values: continuous features with mean/median/any specific value, categorical features with an "Others" label, forward fill, or backward fill.
Feature Engineering.
Split the train data into train and validation data, separating predictors (independent variables) from the target (dependent variable).
# Import Required Python Packages :
# Scientific and Data Manipulation Libraries :
import numpy as np
import pandas as pd
# Data Visualization Libraries :
import matplotlib.pyplot as plt
%matplotlib inline
# Scikit-Learn ML Libraries :
from sklearn.preprocessing import *
from sklearn.model_selection import *
from sklearn.metrics import *
# Garbage Collection Libraries :
import gc
# Boosting, Ensemble & Cross-Validation Libraries :
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold,KFold
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
/kaggle/input/customer/Test.csv
/kaggle/input/customer/Train.csv
/kaggle/input/customer/sample_submission.csv
# Load data
train = pd.read_csv('../input/customer/Train.csv')
test = pd.read_csv('../input/customer/Test.csv')
sub = pd.read_csv('../input/customer/sample_submission.csv')
# Python Function 1 : Displays Data Information :
def display_data_information(data, data_types, dataframe_name):
    print(" Information of ", dataframe_name, ": Rows = ", data.shape[0], "| Columns = ", data.shape[1], "\n")
    data.info()
    print("\n")
    for VARIABLE in data_types:
        data_type = data.select_dtypes(include=[VARIABLE]).dtypes
        if len(data_type) > 0:
            print(str(len(data_type)) + " " + VARIABLE + " Features\n" + str(data_type) + "\n")
# Display Data Information of "train" :
data_types = ["float32","float64","int32","int64","object","category","datetime64[ns]"]
display_data_information(train, data_types, "train")
Information of train : Rows = 8068 | Columns = 11

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   ID               8068 non-null   int64
 1   Gender           8068 non-null   object
 2   Ever_Married     7928 non-null   object
 3   Age              8068 non-null   int64
 4   Graduated        7990 non-null   object
 5   Profession       7944 non-null   object
 6   Work_Experience  7239 non-null   float64
 7   Spending_Score   8068 non-null   object
 8   Family_Size      7733 non-null   float64
 9   Var_1            7992 non-null   object
 10  Segmentation     8068 non-null   object
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB

2 float64 Features
Work_Experience    float64
Family_Size        float64
dtype: object

2 int64 Features
ID     int64
Age    int64
dtype: object

7 object Features
Gender            object
Ever_Married      object
Graduated         object
Profession        object
Spending_Score    object
Var_1             object
Segmentation      object
dtype: object
# Display Data Information of "test" :
display_data_information(test, data_types, "test")
Information of test : Rows = 2627 | Columns = 10

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2627 entries, 0 to 2626
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   ID               2627 non-null   int64
 1   Gender           2627 non-null   object
 2   Ever_Married     2577 non-null   object
 3   Age              2627 non-null   int64
 4   Graduated        2603 non-null   object
 5   Profession       2589 non-null   object
 6   Work_Experience  2358 non-null   float64
 7   Spending_Score   2627 non-null   object
 8   Family_Size      2514 non-null   float64
 9   Var_1            2595 non-null   object
dtypes: float64(2), int64(2), object(6)
memory usage: 205.4+ KB

2 float64 Features
Work_Experience    float64
Family_Size        float64
dtype: object

2 int64 Features
ID     int64
Age    int64
dtype: object

6 object Features
Gender            object
Ever_Married      object
Graduated         object
Profession        object
Spending_Score    object
Var_1             object
dtype: object
# Python Function 2 : Displays Data Head (Top Rows) and Tail (Bottom Rows) of the Dataframe (Table) :
def display_head_tail(data, head_rows, tail_rows):
    display("Data Head & Tail :")
    # DataFrame.append() is deprecated/removed in newer pandas versions, so pd.concat() is used instead :
    display(pd.concat([data.head(head_rows), data.tail(tail_rows)]))
# return True
# Displays Data Head (Top Rows) and Tail (Bottom Rows) of the Dataframe (Table)
# Pass Dataframe as "train", No. of Rows in Head = 3 and No. of Rows in Tail = 2 :
display_head_tail(train, head_rows=3, tail_rows=2)
'Data Head & Tail :'
| | ID | Gender | Ever_Married | Age | Graduated | Profession | Work_Experience | Spending_Score | Family_Size | Var_1 | Segmentation |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 462809 | Male | No | 22 | No | Healthcare | 1.0 | Low | 4.0 | Cat_4 | D |
| 1 | 462643 | Female | Yes | 38 | Yes | Engineer | NaN | Average | 3.0 | Cat_4 | A |
| 2 | 466315 | Female | Yes | 67 | Yes | Engineer | 1.0 | Low | 1.0 | Cat_6 | B |
| 8066 | 467299 | Female | No | 27 | Yes | Healthcare | 1.0 | Low | 4.0 | Cat_6 | B |
| 8067 | 461879 | Male | Yes | 37 | Yes | Executive | 0.0 | Average | 3.0 | Cat_4 | B |
display_head_tail(test, head_rows=3, tail_rows=2)
'Data Head & Tail :'
| | ID | Gender | Ever_Married | Age | Graduated | Profession | Work_Experience | Spending_Score | Family_Size | Var_1 |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 458989 | Female | Yes | 36 | Yes | Engineer | 0.0 | Low | 1.0 | Cat_6 |
| 1 | 458994 | Male | Yes | 37 | Yes | Healthcare | 8.0 | Average | 4.0 | Cat_6 |
| 2 | 458996 | Female | Yes | 69 | No | NaN | 0.0 | Low | 1.0 | Cat_6 |
| 2625 | 467961 | Male | Yes | 47 | Yes | Executive | 1.0 | High | 5.0 | Cat_4 |
| 2626 | 467968 | Female | No | 43 | Yes | Healthcare | 9.0 | Low | 3.0 | Cat_7 |
# Python Function 3 : Displays Data Description using Statistics :
def display_data_description(data, numeric_data_types, categorical_data_types):
    print("Data Description :")
    display(data.describe(include=numeric_data_types))
    print("")
    display(data.describe(include=categorical_data_types))
# Display Data Description of "train" :
display_data_description(train, data_types[0:4], data_types[4:7])
Data Description :
| | ID | Age | Work_Experience | Family_Size |
|---|---|---|---|---|
| count | 8068.000000 | 8068.000000 | 7239.000000 | 7733.000000 |
| mean | 463479.214551 | 43.466906 | 2.641663 | 2.850123 |
| std | 2595.381232 | 16.711696 | 3.406763 | 1.531413 |
| min | 458982.000000 | 18.000000 | 0.000000 | 1.000000 |
| 25% | 461240.750000 | 30.000000 | 0.000000 | 2.000000 |
| 50% | 463472.500000 | 40.000000 | 1.000000 | 3.000000 |
| 75% | 465744.250000 | 53.000000 | 4.000000 | 4.000000 |
| max | 467974.000000 | 89.000000 | 14.000000 | 9.000000 |

| | Gender | Ever_Married | Graduated | Profession | Spending_Score | Var_1 | Segmentation |
|---|---|---|---|---|---|---|---|
| count | 8068 | 7928 | 7990 | 7944 | 8068 | 7992 | 8068 |
| unique | 2 | 2 | 2 | 9 | 3 | 7 | 4 |
| top | Male | Yes | Yes | Artist | Low | Cat_6 | D |
| freq | 4417 | 4643 | 4968 | 2516 | 4878 | 5238 | 2268 |
# Display Data Description of "test" :
display_data_description(test, data_types[0:4], data_types[4:7])
Data Description :
| | ID | Age | Work_Experience | Family_Size |
|---|---|---|---|---|
| count | 2627.000000 | 2627.000000 | 2358.000000 | 2514.000000 |
| mean | 463433.918919 | 43.649791 | 2.552587 | 2.825378 |
| std | 2618.245698 | 16.967015 | 3.341094 | 1.551906 |
| min | 458989.000000 | 18.000000 | 0.000000 | 1.000000 |
| 25% | 461162.500000 | 30.000000 | 0.000000 | 2.000000 |
| 50% | 463379.000000 | 41.000000 | 1.000000 | 2.000000 |
| 75% | 465696.000000 | 53.000000 | 4.000000 | 4.000000 |
| max | 467968.000000 | 89.000000 | 14.000000 | 9.000000 |

| | Gender | Ever_Married | Graduated | Profession | Spending_Score | Var_1 |
|---|---|---|---|---|---|---|
| count | 2627 | 2577 | 2603 | 2589 | 2627 | 2595 |
| unique | 2 | 2 | 2 | 9 | 3 | 7 |
| top | Male | Yes | Yes | Artist | Low | Cat_6 |
| freq | 1424 | 1520 | 1602 | 802 | 1616 | 1672 |
# Python Function 4 - Checks the Percentage(%) of Common ID's between train and test data using Unique train values :
def data_leak_check(train, test):
    print(np.intersect1d(train['ID'], test['ID']).shape[0] / train['ID'].nunique())
    common_ids = len(set(test['ID'].unique()).intersection(set(train['ID'].unique())))
    print("Unique IDs : ", test.shape[0] - common_ids)
    print("Common IDs : ", common_ids)
# Data Leak Check between Train and Test data
data_leak_check(train,test)
0.2890431333663857
Unique IDs :  295
Common IDs :  2332
# Join test with train on ID so the leaked train rows can be compared against the predictions later :
testx = pd.merge(test, train, how='left', on='ID')
# Python Function 5 : Removes Data Duplicates while Retaining the First one - Similar to SQL DISTINCT :
def remove_duplicate(data):
    print("BEFORE REMOVING DUPLICATES - No. of Rows = ", data.shape[0])
    data.drop_duplicates(keep="first", inplace=True)
    print("AFTER REMOVING DUPLICATES - No. of Rows = ", data.shape[0])
    return data
# Remove Duplicates from "train" data :
train = remove_duplicate(train)
# No Duplicates at all in this dataset!
BEFORE REMOVING DUPLICATES - No. of Rows =  8068
AFTER REMOVING DUPLICATES - No. of Rows =  8068
# Python Function 6 : Fills or Imputes Missing values with Various Methods :
def fill_missing_values(data, fill_value, fill_types, columns, dataframe_name):
    print("Missing Values BEFORE REMOVAL in ", dataframe_name, " data")
    display(data.isnull().sum())
    for column in columns:
        # Fill Missing Values with a Specific Value :
        if "Value_Fill" in fill_types:
            data[column] = data[column].fillna(fill_value)
            # print("Value_Fill")
        # Fill Missing Values with Forward Fill (Previous Row Value as Current Row in Table) :
        if "Forward_Fill" in fill_types:
            data[column] = data[column].ffill(axis=0)
            # print("Forward_Fill")
        # Fill Missing Values with Backward Fill (Next Row Value as Current Row in Table) :
        if "Backward_Fill" in fill_types:
            data[column] = data[column].bfill(axis=0)
            # print("Backward_Fill")
    # print("Missing Values AFTER REMOVAL in ", dataframe_name, " data")
    # display(data.isnull().sum())
    return data
# Possible fill_types values are Value_Fill, Forward_Fill and Backward_Fill :
fill_types = ["Forward_Fill"]
fill_value = 0
# NOTE : The "Registration_Date" column referenced in the original calls below is a leftover from a
# template notebook and does not exist in this dataset, so running them would raise a KeyError.
# The missing values in Ever_Married, Graduated, Profession, Work_Experience, Family_Size and Var_1
# are therefore left as-is here; the label encoding and LightGBM below can handle them.
# train = fill_missing_values(train, fill_value, fill_types, ["Registration_Date"], "train")
# test = fill_missing_values(test, fill_value, fill_types, ["Registration_Date"], "test")
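If one did want to impute the columns that actually contain missing values, the Value_Fill branch of fill_missing_values could be used with statistics learned on the train data, e.g. the median for the numeric columns and the mode for the categorical ones. This is only a sketch of an alternative, not something this notebook runs:

# Illustrative alternative (not what this notebook runs) : impute the numeric columns with their
# train medians and the categorical columns with their train modes via the "Value_Fill" branch :
for column in ["Work_Experience", "Family_Size"]:
    median_value = train[column].median()        # statistic learned on the train data only
    train = fill_missing_values(train, median_value, ["Value_Fill"], [column], "train")
    test = fill_missing_values(test, median_value, ["Value_Fill"], [column], "test")
for column in ["Ever_Married", "Graduated", "Profession", "Var_1"]:
    mode_value = train[column].mode()[0]         # most frequent category in the train data
    train = fill_missing_values(train, mode_value, ["Value_Fill"], [column], "train")
    test = fill_missing_values(test, mode_value, ["Value_Fill"], [column], "test")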
# Python Function 7 : Displays Unique Values in Each Column of the Dataframe(Table) :
def display_unique(data):
    for column in data.columns:
        print("No of Unique Values in " + column + " Column are : " + str(data[column].nunique()))
        print("Actual Unique Values in " + column + " Column are : " + str(data[column].sort_values(ascending=True, na_position='last').unique()))
        print("NULL Values :")
        print(data[column].isnull().sum())
        print("Value Counts :")
        print(data[column].value_counts())
        print("")
# Displays Unique Values in Each Column of "train" :
# Check "train" data for Values of each Column - Long Form :
display_unique(train)
# Display this info in a Table Format - Improvements coming in Part 2 (a small sketch follows the output below)
Output (condensed) - per-column unique values, NULL counts and value counts of "train" :

| Column | Unique Values | NULL Values | Value Counts (most frequent first) |
|---|---|---|---|
| ID | 8068 | 0 | every ID occurs once |
| Gender | 2 (Female, Male) | 0 | Male 4417, Female 3651 |
| Ever_Married | 2 (No, Yes) | 140 | Yes 4643, No 3285 |
| Age | 67 (18 to 89) | 0 | 35: 250, 37: 234, 33: 232, 42: 232, 27: 229, ... |
| Graduated | 2 (No, Yes) | 78 | Yes 4968, No 3022 |
| Profession | 9 | 124 | Artist 2516, Healthcare 1332, Entertainment 949, Engineer 699, Doctor 688, Lawyer 623, Executive 599, Marketing 292, Homemaker 246 |
| Work_Experience | 15 (0.0 to 14.0) | 829 | 1.0: 2354, 0.0: 2318, 9.0: 474, 8.0: 463, 2.0: 286, 3.0: 255, 4.0: 253, 6.0: 204, 7.0: 196, 5.0: 194, 10.0: 53, 11.0: 50, 12.0: 48, 13.0: 46, 14.0: 45 |
| Spending_Score | 3 (Average, High, Low) | 0 | Low 4878, Average 1974, High 1216 |
| Family_Size | 9 (1.0 to 9.0) | 335 | 2.0: 2390, 3.0: 1497, 1.0: 1453, 4.0: 1379, 5.0: 612, 6.0: 212, 7.0: 96, 8.0: 50, 9.0: 44 |
| Var_1 | 7 (Cat_1 to Cat_7) | 76 | Cat_6 5238, Cat_4 1089, Cat_3 822, Cat_2 422, Cat_7 203, Cat_1 133, Cat_5 85 |
| Segmentation | 4 (A, B, C, D) | 0 | D 2268, A 1972, C 1970, B 1858 |
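A minimal sketch of the "table format" improvement promised above: collect the per-column unique count, null count and most frequent value into a single DataFrame (unique_summary is a hypothetical helper name, not part of this notebook):

# Sketch of a tabular version of display_unique :
def unique_summary(data):
    summary = pd.DataFrame({
        "Unique_Values": data.nunique(),     # number of distinct values per column
        "Null_Values": data.isnull().sum(),  # number of missing entries per column
        "Top_Value": data.mode().iloc[0],    # most frequent value per column
    })
    return summary

display(unique_summary(train))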
# Concatenate train and test data into single DataFrame - df :
train['is_train'] = 1
test['is_train'] = 0
df = pd.concat([train,test])
# Map the target Segmentation classes (A, B, C, D) to integer labels :
df['Segmentation'] = df['Segmentation'].map({'A':0,'B':1,'C':2,'D':3})
# Get Back train data from df with a condition on column is_train == 1 :
train = df[df['is_train'] == 1]
# Separate predictors (independent) and target (dependent); the train data is later split into 5 stratified folds and the accuracy of each fold is checked :
predictor_train = train.drop(['Segmentation','is_train','ID'],axis=1)
target_train = train['Segmentation']
predictor_test = test.drop(['is_train','ID'],axis=1)
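The step list at the top also mentions splitting the train data into train and validation sets. This notebook relies on StratifiedKFold cross-validation further below, but a simple hold-out split would look like the sketch here (illustrative only; train_test_split comes from the sklearn.model_selection wildcard import, and the columns are still un-encoded at this point):

# Illustrative hold-out split (the notebook itself uses StratifiedKFold below) :
X_train, X_valid, y_train, y_valid = train_test_split(
    predictor_train, target_train,
    test_size=0.2,          # hold out 20% of the rows for validation
    stratify=target_train,  # preserve the A/B/C/D class balance in both splits
    random_state=100
)
print(X_train.shape, X_valid.shape)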
# Python Function 8 - Apply various Encoding Techniques to the data
def data_encoding(encoding_strategy, encoding_data, encoding_columns):
    if encoding_strategy == "LabelEncoding":
        Encoder = LabelEncoder()
        for column in encoding_columns:
            # astype(str) lets LabelEncoder treat NaN entries as their own 'nan' category :
            encoding_data[column] = Encoder.fit_transform(encoding_data[column].astype(str))
    elif encoding_strategy == "OneHotEncoding":
        # display(encoding_data[encoding_columns])
        encoding_data = pd.get_dummies(encoding_data)
    elif encoding_strategy == "TargetEncoding":
        ## Code Coming soon
        print("TargetEncoding")
    else:
        encoding_data = pd.get_dummies(encoding_data[encoding_columns])
    return encoding_data
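The "TargetEncoding" branch above is only a placeholder ("Code Coming soon"). As a rough illustration, a mean target encoding helper might look like the sketch below (hypothetical, not part of this notebook's API; for a multi-class target like Segmentation one would normally encode a per-class statistic, e.g. the share of one class per category, rather than the raw mean shown here):

# Hypothetical sketch of mean target encoding (not used by this notebook) :
def target_encode(train_data, test_data, columns, target_column):
    for column in columns:
        # Mean of the (numeric) target per category, learned on the train data only :
        category_means = train_data.groupby(column)[target_column].mean()
        global_mean = train_data[target_column].mean()
        # Unseen categories in test fall back to the global mean :
        train_data[column + "_target_enc"] = train_data[column].map(category_means).fillna(global_mean)
        test_data[column + "_target_enc"] = test_data[column].map(category_means).fillna(global_mean)
    return train_data, test_data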
encoding_columns = [ "Gender", "Ever_Married" , "Graduated", "Profession" , "Spending_Score", "Var_1" ]
encoding_strategy = [ "OneHotEncoding", "LabelEncoding", "TargetEncoding", "ELSE"]
predictor_train_encode = data_encoding( encoding_strategy[1] , predictor_train , encoding_columns )
predictor_test_encode = data_encoding( encoding_strategy[1] , predictor_test , encoding_columns )
print("predictor_train_encode SHAPE : ",predictor_train_encode.shape)
display("predictor_train_encode COLUMNS : ",predictor_train_encode.head())
print("predictor_test_encode SHAPE : ",predictor_test_encode.shape)
display("predictor_test_encode COLUMNS : ",predictor_test_encode.head())
predictor_train_encode SHAPE : (8068, 9)
'predictor_train_encode COLUMNS : '
| | Gender | Ever_Married | Age | Graduated | Profession | Work_Experience | Spending_Score | Family_Size | Var_1 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 22 | 0 | 5 | 1.0 | 2 | 4.0 | 3 |
| 1 | 0 | 1 | 38 | 1 | 2 | NaN | 0 | 3.0 | 3 |
| 2 | 0 | 1 | 67 | 1 | 2 | 1.0 | 2 | 1.0 | 5 |
| 3 | 1 | 1 | 67 | 1 | 7 | 0.0 | 1 | 2.0 | 5 |
| 4 | 0 | 1 | 40 | 1 | 3 | NaN | 1 | 6.0 | 5 |
predictor_test_encode SHAPE : (2627, 9)
'predictor_test_encode COLUMNS : '
| | Gender | Ever_Married | Age | Graduated | Profession | Work_Experience | Spending_Score | Family_Size | Var_1 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 36 | 1 | 2 | 0.0 | 2 | 1.0 | 5 |
| 1 | 1 | 1 | 37 | 1 | 5 | 8.0 | 0 | 4.0 | 5 |
| 2 | 0 | 1 | 69 | 0 | 9 | 0.0 | 2 | 1.0 | 5 |
| 3 | 1 | 1 | 59 | 0 | 4 | 11.0 | 1 | 2.0 | 5 |
| 4 | 0 | 0 | 19 | 0 | 8 | NaN | 2 | 4.0 | 5 |
# Python Function 9 - Creates a baseline model based on user choice and performs cross validation :
# Accuracy is used as the metric here - it can be changed based on the use case :
def create_baseline_model(model):
    # Mention the Categorical Features for the LightGBM Model to Handle :
    categorical_features = ["Gender", "Ever_Married", "Graduated", "Profession", "Spending_Score", "Var_1"]
    lgb_model = model
    # Apply Stratified K-Fold Cross Validation where K=5 or n_splits=5 :
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
    acc = []
    # Pass predictor_train, target_train for Cross Validation :
    for fold, (t_id, v_id) in enumerate(kf.split(predictor_train, target_train)):
        # Split into train and validation data for this fold :
        tx = predictor_train.iloc[t_id]; ty = target_train.iloc[t_id]
        vx = predictor_train.iloc[v_id]; vy = target_train.iloc[v_id]
        # Train/Fit the Data to the LightGBM Model :
        lgb_model.fit(tx, ty, categorical_feature=categorical_features)
        # Predict on the Validation Data :
        val_y = lgb_model.predict(vx)
        # Get the Accuracy Score on the Validation Data for Each Fold :
        acc_score = accuracy_score(vy, val_y)
        acc.append(acc_score)
        print(f"fold {fold} accuracy {acc_score}")
    # Get the Mean Accuracy Score across All 5 Folds :
    print(f"Mean accuracy score {np.mean(acc)}")
# Note : data_encoding above label-encoded predictor_train in place, so its object columns are already integer codes here.
# Choose any tree-based ML Model to pass here :
model = LGBMClassifier()
create_baseline_model(model)
/opt/conda/lib/python3.7/site-packages/lightgbm/basic.py:1295: UserWarning: categorical_feature in Dataset is overridden. New categorical_feature is ['Ever_Married', 'Gender', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
fold 0 accuracy 0.533457249070632
fold 1 accuracy 0.5198265179677819
fold 2 accuracy 0.5285006195786865
fold 3 accuracy 0.5151890886546807
fold 4 accuracy 0.5561066336019839
Mean accuracy score 0.5306160217747531
# Tuned Hyperparameters of the LightGBM Classifier :
lgb_model = LGBMClassifier(
boosting_type='gbdt',
max_depth=15,
learning_rate=0.15,
objective='multiclass', # Multi Class Classification
random_state=100,
n_estimators=1000 ,
reg_alpha=0,
reg_lambda=1,
n_jobs=-1
)
# Call the Python Function 9 again with the Tuned ML Model:
create_baseline_model(lgb_model)
/opt/conda/lib/python3.7/site-packages/lightgbm/basic.py:1295: UserWarning: categorical_feature in Dataset is overridden. New categorical_feature is ['Ever_Married', 'Gender', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
fold 0 accuracy 0.5006195786864932
fold 1 accuracy 0.4851301115241636
fold 2 accuracy 0.476456009913259
fold 3 accuracy 0.46745195288282704
fold 4 accuracy 0.49473031618102914
Mean accuracy score 0.48487759383755435
# Python Function 10 - Trains the model, predicts on the test data and writes a CSV of the predictions :
def model_train_predict_submit(Classifiers_model_name, model_name, X_train, y_train, X_test, target):
    categorical_features = ["Gender", "Ever_Married", "Graduated", "Profession", "Spending_Score", "Var_1"]
    Classifiers_model_name.fit(X_train, y_train, categorical_feature=categorical_features)
    final_predictions = Classifiers_model_name.predict(X_test)
    print(final_predictions)
    Result_Promoted = pd.DataFrame({'ID': sub['ID'], target: final_predictions})
    # Map the integer labels back to the original Segmentation classes :
    Result_Promoted[target] = Result_Promoted[target].map({0: "A", 1: "B", 2: "C", 3: "D"})
    print(Result_Promoted[target].unique())
    Result_Promoted.to_csv(model_name + "_Labelling=Yes_Scaling=Yes" + ".csv", index=False)
    return Result_Promoted
model_name = "LGBM_Tuned_BEST"
model_classifier = lgb_model
sub = model_train_predict_submit( model_classifier, model_name, predictor_train_encode,target_train, predictor_test_encode, target = 'Segmentation')
[1. 2. 0. ... 0. 2. 3.]
['B' 'C' 'A' 'D']
# Merge the submission with testx so the prediction sits next to the leaked train label for common IDs :
sub1 = pd.merge(sub, testx, how='left', on='ID')
sub1.head()
| | ID | Segmentation_x | Gender_x | Ever_Married_x | Age_x | Graduated_x | Profession_x | Work_Experience_x | Spending_Score_x | Family_Size_x | ... | Gender_y | Ever_Married_y | Age_y | Graduated_y | Profession_y | Work_Experience_y | Spending_Score_y | Family_Size_y | Var_1_y | Segmentation_y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 458989 | B | Female | Yes | 36 | Yes | Engineer | 0.0 | Low | 1.0 | ... | Female | Yes | 42.0 | Yes | Engineer | 1.0 | Low | 1.0 | Cat_6 | B |
| 1 | 458994 | C | Male | Yes | 37 | Yes | Healthcare | 8.0 | Average | 4.0 | ... | Male | Yes | 38.0 | Yes | Healthcare | 8.0 | Average | 4.0 | Cat_6 | C |
| 2 | 458996 | A | Female | Yes | 69 | No | NaN | 0.0 | Low | 1.0 | ... | Female | Yes | 71.0 | No | NaN | 1.0 | Low | 1.0 | Cat_6 | A |
| 3 | 459000 | C | Male | Yes | 59 | No | Executive | 11.0 | High | 2.0 | ... | Male | Yes | 58.0 | No | Executive | 12.0 | High | 2.0 | Cat_6 | C |
| 4 | 459001 | D | Female | No | 19 | No | Marketing | NaN | Low | 4.0 | ... | Female | No | 20.0 | No | Marketing | NaN | Low | 4.0 | Cat_6 | C |
5 rows × 21 columns
# Attach the leaked train label (where available) next to the predicted Segmentation :
sub['segmentation2'] = sub1['Segmentation_y']
sub.head()
| | ID | Segmentation | segmentation2 |
|---|---|---|---|
| 0 | 458989 | B | B |
| 1 | 458994 | C | C |
| 2 | 458996 | A | A |
| 3 | 459000 | C | C |
| 4 | 459001 | D | C |
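Since the leak check above found 2332 test IDs that also appear in train, a quick sanity check (a sketch, not in the original notebook) is to measure how often the model's prediction agrees with the leaked train label on those rows:

# Keep only the rows whose ID also exists in train (segmentation2 is NaN otherwise) :
leaked = sub.dropna(subset=["segmentation2"])
agreement = (leaked["Segmentation"] == leaked["segmentation2"]).mean()
print("Rows with a leaked train label :", leaked.shape[0])
print("Agreement between prediction and leaked label :", round(agreement, 4))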