码迷,mamicode.com
首页 > 其他好文 > 详细

kaggle_Titanic

时间:2017-10-09 17:42:21      阅读:336      评论:0      收藏:0      [点我收藏+]

标签:rac   inpu   std   sse   creat   titan   binary   rom   line   

# -*- coding: utf-8 -*-
"""
Created on Mon Oct  9 14:05:41 2017

@author: lenovo
"""

import numpy as np
import pandas as pd

# Load the data. Each row is tagged with its origin ('train' / 'test') in an
# 'id' column so both sets can share the feature processing below and be
# separated again before modelling.
data_train = pd.read_csv('./input/train.csv')
data_train['id'] = 'train'
data_test = pd.read_csv('./input/test.csv')
data_test['id'] = 'test'
# ignore_index gives the combined frame unique row labels; without it the
# labels of the two source files would repeat.
data = pd.concat((data_train, data_test), axis=0, ignore_index=True)

# Report the number of missing values in every column.
for column in data.columns:
    print(column, data[column].isnull().sum())

# Fill in the fare data.
# Mean fare per passenger class, computed from strictly positive fares only.
# 'Fare' is selected before aggregating so that non-numeric columns are never
# averaged.
fare_mean = data[data['Fare'] > 0].groupby('Pclass')['Fare'].mean()
# Replace both missing fares and zero fares with the mean fare of the
# passenger's class.
fare_needs_fill = data['Fare'].isnull() | (data['Fare'] == 0)
for pclass, class_mean in fare_mean.items():
    data.loc[fare_needs_fill & (data['Pclass'] == pclass), 'Fare'] = class_mean
# Handle missing ages: train a random-forest regressor on the rows whose age
# is known and use it to predict the missing ones.
from sklearn.ensemble import RandomForestRegressor

age_features = ['Fare', 'Parch', 'SibSp', 'Pclass']
age_missing = data['Age'].isnull()

x = data.loc[~age_missing, age_features].values
y = data.loc[~age_missing, 'Age'].values
x_test = data.loc[age_missing, age_features].values

rf = RandomForestRegressor(n_estimators=200, max_depth=5)
rf.fit(x, y)
y_pred = rf.predict(x_test)
# The mask used for the write-back is the same one that selected x_test, so
# the predictions line up with the rows positionally.
data.loc[age_missing, 'Age'] = y_pred
# Sex has no missing values; encode it directly as 0 (female) / 1 (male).
data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})

# One-hot encode SibSp, Pclass and Parch.
SibSp = pd.get_dummies(data['SibSp'], prefix='SibSp')
Pclass = pd.get_dummies(data['Pclass'], prefix='Pclass')
Parch = pd.get_dummies(data['Parch'], prefix='Parch')

# Fill missing Embarked values with 'S' (described as the mode by the original
# author). Assigning the result back avoids relying on an in-place fillna on a
# column selection, which may act on a temporary object.
data['Embarked'] = data['Embarked'].fillna('S')
Embarked = pd.get_dummies(data['Embarked'], prefix='Embarked')

# Treat a missing Cabin as its own category, 'u0'. A single .loc assignment is
# required: the chained form data[mask]['Cabin'] = ... writes to a temporary
# copy and leaves `data` unchanged.
data.loc[data['Cabin'].isnull(), 'Cabin'] = 'u0'
# NOTE(review): the Cabin dummies are built but not merged into data_all
# below - confirm whether leaving them out is intended.
Cabin = pd.get_dummies(data['Cabin'], prefix='Cabin')

# Merge everything: drop the raw columns that were encoded above, plus the
# columns not used as features, then append the one-hot frames.
data.drop(
    ['SibSp', 'Pclass', 'Parch', 'Embarked', 'Cabin',
     'Name', 'Ticket', 'PassengerId'],
    axis=1,
    inplace=True,
)
data_all = pd.concat([data, SibSp, Pclass, Parch, Embarked], axis=1)

# Modelling: split the combined frame back into the labelled training rows and
# the unlabelled test rows using the 'id' marker column.
# drop() is used in its copying form; dropping in place on a filtered
# selection would trigger pandas' SettingWithCopyWarning.
data_train = data_all[data_all['id'] == 'train'].drop('id', axis=1)
data_test = data_all[data_all['id'] == 'test'].drop(['Survived', 'id'], axis=1)
x = data_train.drop('Survived', axis=1).values
y = data_train['Survived']

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

# Use 70% of the labelled rows for training and hold out the rest for
# validation; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=0)
# Fit the scaler on the training part only, then apply it to the held-out part.
sc = StandardScaler()
x_train_std = sc.fit_transform(x_train)
x_test_std = sc.transform(x_test)

# Logistic regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2')
lr.fit(x_train_std, y_train)
y_pred_lr = lr.predict(x_test_std)
print('Logistic Regression:', metrics.accuracy_score(y_test, y_pred_lr))
# Score recorded by the original author:
# Logistic Regression: 0.809701492537

# Decision tree, with max_depth chosen by 5-fold cross-validated grid search.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

dt = DecisionTreeClassifier()
model_dt = GridSearchCV(dt, param_grid={'max_depth': list(range(1, 10))}, cv=5)
model_dt.fit(x_train_std, y_train)
y_pred_dt = model_dt.predict(x_test_std)
print('Decision Tree:', metrics.accuracy_score(y_test, y_pred_dt))
# Score recorded by the original author:
# Decision Tree: 0.813432835821

# Random forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=4, n_estimators=200)
rf.fit(x_train_std, y_train)
y_pred_rf = rf.predict(x_test_std)
# Predictions for the real test set. The features are scaled here with the
# already-fitted scaler, because no scaled copy of data_test exists yet at
# this point in the script.
y_pred_rf1 = rf.predict(sc.transform(data_test))
print('RandomForest:', metrics.accuracy_score(y_test, y_pred_rf))
# Score recorded by the original author:
# RandomForest: 0.817164179104


# SVM with an RBF kernel; C and gamma are chosen by 5-fold grid search.
from sklearn.svm import SVC

svc = SVC(kernel='rbf', decision_function_shape='ovo')
# NOTE(review): the gamma grid (10..100) looks large for standardized
# features and may explain the lower score below - confirm before reuse.
model_svc = GridSearchCV(
    svc,
    param_grid={
        'C': np.arange(5, 10) / 10,
        'gamma': list(range(10, 101, 10)),
    },
    cv=5,
)
model_svc.fit(x_train_std, y_train)
y_pred_svc = model_svc.predict(x_test_std)

print('SVM:', metrics.accuracy_score(y_test, y_pred_svc))
# Score recorded by the original author:
# SVM: 0.723880597015

# xgboost: wrap the scaled splits in DMatrix objects and set the parameters.
import xgboost as xgb

xgb_train = xgb.DMatrix(x_train_std, label=y_train)
xgb_test = xgb.DMatrix(x_test_std, label=y_test)
# The learning task is selected with the 'objective' key; the original key
# 'object' is not an XGBoost parameter.
param = {
    'max_depth': 4,
    'eta': 0.3,
    'silent': 1,
    'objective': 'binary:logistic',
}
# Data sets evaluated after every boosting round, with their display names.
watchlist = [(xgb_train, 'train'), (xgb_test, 'test')]
def error_rate(y_hat, y):
    """Return the misclassification rate as a ``(name, value)`` pair.

    Used as the custom evaluation function (``feval``) for ``xgb.train``.

    Parameters
    ----------
    y_hat : array-like
        Predicted scores; a score strictly greater than 0.5 counts as a
        positive prediction.
    y : object with a ``get_label()`` method
        Data set whose ``get_label()`` returns the true labels, aligned
        with ``y_hat``.

    Returns
    -------
    tuple
        ``('error', rate)`` where ``rate`` is the fraction of entries whose
        thresholded prediction differs from the label.
    """
    predicted_positive = y_hat > 0.5
    mismatches = y.get_label() != predicted_positive
    # The metric name must be a string; the bare name `error` is undefined.
    return 'error', float(sum(mismatches)) / len(y_hat)
# Train for four boosting rounds, reporting error_rate on the watchlist sets.
bst = xgb.train(
    param,
    xgb_train,
    evals=watchlist,
    num_boost_round=4,
    feval=error_rate,
)
y_pred_xgb = bst.predict(xgb_test)
# Accuracy on the held-out split, thresholding the predicted scores at 0.5.
print('xgb:', np.average(y_test == (y_pred_xgb > 0.5)))
# Score recorded by the original author:
# XGB: 0.832089552239


# Use the xgboost model to predict the real test set.
# The features are scaled with the scaler fitted on the training split.
data_test_xgb = sc.transform(data_test)
xgb_test = xgb.DMatrix(data_test_xgb)
# One threshold turns every score into an integer 0/1 label. Two separate
# strict comparisons (> 0.5 and < 0.5) would leave a score of exactly 0.5
# unconverted.
y_pred_xgb1 = (bst.predict(xgb_test) > 0.5).astype(int)

# Write the predictions to files: one CSV per model, each built from the same
# template file.
template = pd.read_csv('predictions.csv')
for out_path, predictions in (('xgb.csv', y_pred_xgb1), ('rf.csv', y_pred_rf1)):
    test = template.copy()
    # Store the labels as integers (0/1) rather than floats.
    test['Survived'] = np.asarray(predictions).astype(int)
    # index=False keeps the automatically generated row numbers out of the
    # output file.
    test.to_csv(out_path, index=False)

 

kaggle_Titanic

标签:rac   inpu   std   sse   creat   titan   binary   rom   line   

原文地址:http://www.cnblogs.com/jiegege/p/7641838.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!