码迷,mamicode.com
首页 > 其他好文 > 详细

kaggle Titanic

时间:2018-04-23 16:34:28      阅读:200      评论:0      收藏:0      [点我收藏+]

标签:sse   import   strategy   pass   roc   dataframe   col   桌面   ber   

# coding: utf-8

# In[19]:

# 0.78468


# In[20]:


import numpy as np
import pandas as pd
import warnings
# Suppress every warning category for the rest of the run.
warnings.filterwarnings('ignore')
from sklearn import preprocessing


# In[21]:


# Windows locations of the Titanic CSV files; each trailing comment keeps the
# alternative Linux path for the same file.
train_path = r'C:\Users\cbattle\Desktop\train.csv'  # r'/home/adminn/桌面/train.csv'
test_path = r'C:\Users\cbattle\Desktop\test.csv'  # r'/home/adminn/桌面/test.csv'
out_path = r'C:\Users\cbattle\Desktop\out.csv'  # r'/home/adminn/桌面/out.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Report (rows, columns) of both frames as a quick sanity check.
print('train:', train.shape)
print('test:', test.shape)
# train.info()
# test.info()
# print(train.head())

# Feature columns
# print([col for col in train])
# print([col for col in test])

# Strategy per column
# ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
#     drop          onehot   drop    0/1    num    num       num      drop     num      0/1    fill missing with S, onehot


# In[22]:


# Separate the target column from the features; the passenger id and name
# columns are not used as model inputs.
X = train.drop(['Survived', 'PassengerId', 'Name'], axis=1)
y = train['Survived']
Xtest = test.drop(['PassengerId', 'Name'], axis=1)
# print(‘X:‘,X.shape)
# print(‘y:‘,y.shape)
# print(‘Xtest:‘,Xtest.shape)


# In[23]:


# Keep every non-object column (numeric: 'Pclass', 'Age', 'SibSp', 'Parch',
# 'Fare') plus the three text columns that are encoded further down.
key = [
    col for col in X
    if X[col].dtype != object or col in ('Sex', 'Embarked', 'Cabin')
]
# .copy() gives X / Xtest their own data, so the column assignments that
# follow modify these frames rather than a slice of the originals.
X = X[key].copy()
Xtest = Xtest[key].copy()
# print(key)

def showNullNum(a, b):
    """Print the per-column missing-value counts of two pandas objects.

    The counts for ``a`` are printed first, then a blank line, then the
    counts for ``b``, followed by a dashed separator line.

    Args:
        a: Object exposing ``isnull()`` (e.g. a DataFrame).
        b: Second object of the same kind.
    """
    print(a.isnull().sum())
    print()
    print(b.isnull().sum())
    print('------------------------------------')

# Show how many values are missing per column before any imputation.
showNullNum(X, Xtest)

# Xtest['Fare'][Xtest['Fare'].isnull()] = Xtest['Fare'].median() # replace nan with median
# X = X.dropna(axis=0) # drop X and y in the same row

#-------------------------------------------------------------------------------
# Pclass    Ticket class
# 1 = 1st, 2 = 2nd, 3 = 3rd  onehot
# for i in X['Pclass'].unique():
#     X['Pclass_'+str(i)] = (X['Pclass']==i).astype(int)
#     Xtest['Pclass_'+str(i)] = (Xtest['Pclass']==i).astype(int)

# X = X.drop(['Pclass'],axis=1)
# Xtest = Xtest.drop(['Pclass'],axis=1)

#-------------------------------------------------------------------------------
# Sex: 1 for 'female', 0 for every other value (vectorised comparison
# instead of a per-row lambda).
X['Sex'] = (X['Sex'] == 'female').astype(int)
Xtest['Sex'] = (Xtest['Sex'] == 'female').astype(int)

#-------------------------------------------------------------------------------
# Embarked

# 1 label encoding
# Missing ports are filled with 'S' before mapping. fillna replaces the
# chained X[...][mask] = ... assignment, which pandas may apply to a temporary
# copy; the NaNs would then survive and astype(int) would raise.
port_codes = {'S': 0, 'C': 1, 'Q': 2}
X['Embarked'] = X['Embarked'].fillna('S').map(port_codes).astype(int)
Xtest['Embarked'] = Xtest['Embarked'].fillna('S').map(port_codes).astype(int)
# or use sklearn.preprocessing.LabelEncoder




# print(X.head())
# print(Xtest.head())

# X[‘Embarked‘][X[‘Embarked‘].isnull()] = ‘S‘
# from sklearn import preprocessing
# le = preprocessing.LabelEncoder()
# X[‘Embarked‘] = le.fit_transform(X[‘Embarked‘])
# Xtest[‘Embarked‘] = le.transform(Xtest[‘Embarked‘])

# print(X.head())
# print(Xtest.head())




# 2 onehot
# for i in X[‘Embarked‘].unique():
#     print(i, ‘sum:‘, sum(X[‘Embarked‘]==i))

# X[‘Embarked‘][X[‘Embarked‘].isnull()] = ‘S‘ # most_frequent
# for i in X[‘Embarked‘].unique():
#     X[‘Embarked_type_‘+i] = (X[‘Embarked‘]==i).astype(int)
#     Xtest[‘Embarked_type_‘+i] = (Xtest[‘Embarked‘]==i).astype(int)
    
# X = X.drop([‘Embarked‘],axis=1)
# Xtest = Xtest.drop([‘Embarked‘],axis=1)
# print(X.head(10))

#-------------------------------------------------------------------------------
# Cabin
# has a cabin or not: 1 when the cell holds a string, 0 otherwise (NaN)
# print(X.head(5))
Xtest['Cabin'] = Xtest['Cabin'].apply(lambda i: 1 if isinstance(i, str) else 0)
X['Cabin'] = X['Cabin'].apply(lambda i: 1 if isinstance(i, str) else 0)
# print(X.head(5))


#-------------------------------------------------------------------------------
# age and fare
# use median to replace nan
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; prefer its
# replacement and fall back to the old class only on releases that lack it.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer
ip = Imputer(strategy='median')
# Medians are learned from the training features and reused for the test set.
X = ip.fit_transform(X)
Xtest = ip.transform(Xtest)
# Remaining NaN counts in each array after imputation.
print(np.isnan(X).sum(), np.isnan(Xtest).sum())



# In[24]:


from xgboost import XGBClassifier

# Fit a default-parameter XGBoost classifier on the training data, then
# predict a label for every row of the test features.
xgb = XGBClassifier().fit(X, y)
ans = xgb.predict(Xtest)

# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import ExtraTreesClassifier
# from sklearn.svm import LinearSVC



# In[25]:


# Pair each test passenger id with its predicted label and write the result
# to out_path as CSV without the index column.
out = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': ans})
out.to_csv(out_path, index=False)
print('ok')


# In[26]:


# Scratch demo of LabelEncoder: fit on three labels, then encode a sample list.
# NOTE: this rebinds `ans`, which previously held the model predictions.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(['a', 'b', 'c'])
ans = le.transform(['a', 'a', 'c'])
print(ans)

 

kaggle Titanic

标签:sse   import   strategy   pass   roc   dataframe   col   桌面   ber   

原文地址:https://www.cnblogs.com/cbattle/p/8919752.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!