読者です 読者をやめる 読者になる 読者になる

理科系の備忘録

Linux/Ubuntu/Mac/Emacs/Computer vision/Robotics

import pandas as pd
import numpy as np
import csv as csv
from sklearn.ensemble import RandomForestClassifier
#訓練データの読み込み
train_df = pd.read_csv("train.csv", header=0)
train_df.head(3)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
# Sexをダミー変数に変換(female = 0, Male = 1)してGenderカラムを追加
train_df["Gender"] = train_df["Sex"].map( {"female": 0, "male": 1} ).astype(int)
train_df.head(3)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Gender
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 0
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
# 年齢が欠損している(NaN)ものを抽出
train_df.loc[ train_df.Age.isnull() ].head(3)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Gender
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q 1
17 18 1 2 Williams, Mr. Charles Eugene male NaN 0 0 244373 13.0000 NaN S 1
19 20 1 3 Masselmani, Mrs. Fatima female NaN 0 0 2649 7.2250 NaN C 0
# 年齢の欠損値は、年齢の平均値で補完する
median_age = train_df["Age"].dropna().median() # 年齢がNaNの行をドロップしてメディアンをとる
if len(train_df.Age[ train_df.Age.isnull() ]) > 0: # NaN行が1つ上あるなら
    train_df.loc[ (train_df.Age.isnull()), "Age"] = median_age
# 学習に必要無い列を削除する
train_df = train_df.drop(["Name", "Ticket", "Sex", "SibSp", "Parch", "Fare", "Cabin", "Embarked","PassengerId"], axis=1) 
train_df.head(3)
Survived Pclass Age Gender
0 0 3 22.0 1
1 1 1 38.0 0
2 1 3 26.0 0
#テストデータの読み込み, Sexをダミー変数に変換
test_df = pd.read_csv("test.csv", header=0)
test_df["Gender"] = test_df["Sex"].map( {"female": 0, "male": 1} ).astype(int)
test_df.head(3)
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Gender
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q 1
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S 0
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q 1
# 年齢の欠損値は、年齢の平均値で補完する
median_age = test_df["Age"].dropna().median()
if len(test_df.Age[ test_df.Age.isnull() ]) > 0:
    test_df.loc[ (test_df.Age.isnull()), "Age"] = median_age
# テストデータのPassengerId列を退避させ、テストデータの不要な列を削除する
ids = test_df["PassengerId"].values
test_df = test_df.drop(["Name", "Ticket", "Sex", "SibSp", "Parch", "Fare", "Cabin", "Embarked","PassengerId"], axis=1) 
test_df.head(3)
Pclass Age Gender
0 3 34.5 1
1 3 47.0 0
2 2 62.0 1
#ランダムフォレストで予測
train_data = train_df.values
test_data = test_df.values
model = RandomForestClassifier(n_estimators=100)
output = model.fit(train_data[0::,1::], train_data[0::,0]).predict(test_data).astype(int)
#結果を"titanic_submit.csv"として書き出す
submit_file = open("titanic_submit.csv", "w")
file_object = csv.writer(submit_file)
file_object.writerow(["PassengerId","Survived"])
file_object.writerows(zip(ids, output))
submit_file.close()