kaggle竞赛-预测房价

预测房价

导入数据

import numpy as np
import pandas as pd

# Paths of the competition CSV files, resolved relative to the working directory.
train_file, test_file = 'train.csv', 'test.csv'

# Read both splits with pandas' default CSV settings.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

数据预处理

# Stack train and test into one table so both receive identical preprocessing.
# Leading columns are dropped by position: the first four of train and the
# first three of test.
# NOTE(review): the offsets presumably differ because train carries one extra
# leading column (the label) -- confirm against the CSV headers.
all_features = pd.concat([train_df.iloc[:, 4:], test_df.iloc[:, 3:]], axis=0)

# Standardise every non-object column to zero mean and unit variance, using
# statistics computed over the combined train + test rows.
numeric_index = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_index] = all_features[numeric_index].apply(
    lambda col: (col - col.mean()) / col.std()
)
# After standardisation each column has mean 0, so filling the missing values
# with 0 is the same as imputing the column mean.
all_features[numeric_index] = all_features[numeric_index].fillna(0)

# The remaining gaps are in the object columns: take the next valid value
# further down the table, then use 0 for anything still missing at the end.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
all_features = all_features.bfill(axis=0).fillna(0)
#all_features.dtypes.unique()


# Print, for every object column, how many distinct values it holds -- i.e.
# how many indicator columns a one-hot encoding of it would produce.
object_columns = all_features.dtypes[all_features.dtypes == 'object'].index
for column_name in object_columns:
    distinct_count = len(all_features[column_name].unique())
    print(column_name.ljust(20), distinct_count)

# Keep the numeric columns plus the two categorical columns chosen for
# encoding ('Type' and 'Bedrooms').
features = [*numeric_index, 'Type', 'Bedrooms']
all_features = all_features[features]

# One-hot encode the remaining categorical columns; dummy_na=True also adds an
# indicator column for missing values.
all_features = pd.get_dummies(all_features, dummy_na=True)
# Multiplying by 1 makes any boolean indicator columns numeric.
all_features = all_features * 1

# Convert the preprocessed DataFrame into tensors.
import torch

# all_features was built by concatenating the train rows first, so the first
# train_num rows are the training set and the rest are the test set.
train_num = train_df.shape[0]
train_features = all_features.iloc[:train_num, :]
test_features = all_features.iloc[train_num:, :]
train_labels = train_df['Sold Price']

train_features = torch.tensor(train_features.values, dtype=torch.float32)
test_features = torch.tensor(test_features.values, dtype=torch.float32)
# Labels are reshaped to a single column so they match the (N, 1) output of
# a network with one output unit.
train_labels = torch.tensor(train_labels.values.reshape(-1, 1), dtype=torch.float32)
print(train_features.shape)
print(train_labels.shape)
print(test_features.shape)

模型定义与训练

#定义模型
from torch import nn
import torch
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Mean-squared-error criterion, moved to the selected device.
loss = nn.MSELoss().to(device)
# Number of columns in the training feature matrix.
in_features = train_features.size(1)

def get_net():
    """Build a freshly initialised regression network on ``device``.

    The network maps ``in_features`` inputs through one 256-unit ReLU hidden
    layer to a single output value.
    """
    hidden_units = 256
    layers = [
        nn.Linear(in_features, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, 1),
    ]
    return nn.Sequential(*layers).to(device)

def log_rmse(net, features, labels):
    """Return the RMSE between log-predictions and log-labels as a float.

    Args:
        net: Model called as ``net(features)``.
        features: Input tensor, already on the same device as ``net``.
        labels: Target tensor, compared element-wise with the predictions.

    Returns:
        ``sqrt(loss(log(clamp(net(features), 1, inf)), log(labels)))`` as a
        Python float, where ``loss`` is the module-level criterion.
    """
    # Evaluation only: no gradients are needed, so skip building the autograd
    # graph for this forward pass.
    with torch.no_grad():
        # Clamp predictions to at least 1 so the logarithm is defined and
        # non-negative.
        y_hat = torch.clamp(net(features), 1, float('inf'))
        out = torch.sqrt(loss(torch.log(y_hat), torch.log(labels)))
    return out.item()

#训练
from d2l import torch as d2l
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train ``net`` with Adam and record the log-RMSE after every epoch.

    Args:
        net: Model to optimise in place.
        train_features, train_labels: Training tensors.
        test_features, test_labels: Held-out tensors; pass ``None`` for
            ``test_labels`` to skip held-out evaluation.
        num_epochs: Number of passes over the training data.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight-decay coefficient.
        batch_size: Mini-batch size handed to ``d2l.load_array``.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log-RMSE lists. ``test_ls`` is
        empty when ``test_labels`` is ``None``.
    """
    train_ls, test_ls = [], []
    # Mini-batches are drawn from the tensors exactly as passed in; each batch
    # is moved to the device inside the loop.
    train_iter = d2l.load_array((train_features, train_labels), batch_size)
    optimizer = torch.optim.Adam(
        net.parameters(), lr=learning_rate, weight_decay=weight_decay
    )

    # Full-data copies used for the per-epoch evaluation. The transfer does
    # not depend on the epoch, so it is done once here.
    eval_train_features = train_features.to(device)
    eval_train_labels = train_labels.to(device)
    has_test = test_labels is not None
    if has_test:
        eval_test_features = test_features.to(device)
        eval_test_labels = test_labels.to(device)

    for _ in range(num_epochs):
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            batch_loss = loss(net(X), y)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        train_ls.append(log_rmse(net, eval_train_features, eval_train_labels))
        if has_test:
            test_ls.append(log_rmse(net, eval_test_features, eval_test_labels))
    return train_ls, test_ls

def get_k_fold_data(k, i, X, y):
    """Return the train/validation split for fold ``i`` of a ``k``-fold split.

    The first ``k * (X.shape[0] // k)`` rows are cut into ``k`` contiguous,
    equally sized folds. Fold ``i`` becomes the validation set and the other
    folds, in their original order, become the training set. Any remainder
    rows beyond the last full fold are not used.

    Args:
        k: Number of folds; must be at least 2.
        i: Zero-based index of the validation fold; must be in ``range(k)``.
        X: Feature tensor, sliced along dimension 0.
        y: Label tensor, sliced along dimension 0 with the same indices.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``.

    Raises:
        ValueError: If ``k < 2`` or ``i`` is outside ``range(k)``.
    """
    # With fewer than two folds there is no training data, and an index
    # outside range(k) selects no validation fold; fail early with a clear
    # message instead.
    if k < 2:
        raise ValueError(f'k must be at least 2, got {k}')
    if not 0 <= i < k:
        raise ValueError(f'fold index i must be in range({k}), got {i}')

    fold_size = X.shape[0] // k
    start, stop = i * fold_size, (i + 1) * fold_size
    used_rows = k * fold_size  # rows past this point do not fill a whole fold

    X_valid, y_valid = X[start:stop], y[start:stop]
    # Everything before and after the validation fold, joined in one concat.
    X_train = torch.cat([X[:start], X[stop:used_rows]], 0)
    y_train = torch.cat([y[:start], y[stop:used_rows]], 0)
    return X_train, y_train, X_valid, y_valid

def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    """Run ``k``-fold cross-validation and return the mean final log-RMSEs.

    For each fold a new network is created with ``get_net()`` and trained with
    ``train()`` on the split produced by ``get_k_fold_data()``. The log-RMSE
    of the last epoch is printed per fold and accumulated.

    Args:
        k: Number of folds.
        X_train, y_train: Full training tensors to split.
        num_epochs, learning_rate, weight_decay, batch_size: Forwarded to
            ``train()``.

    Returns:
        ``(mean_train_log_rmse, mean_valid_log_rmse)`` averaged over the folds.
    """
    train_sum, valid_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate, weight_decay, batch_size)
        # Only the final-epoch value of each fold contributes to the average.
        train_sum += train_ls[-1]
        valid_sum += valid_ls[-1]
        print(f'fold {i+1},train log rmse{float(train_ls[-1]):f},' f'valid log rmse {float(valid_ls[-1]):f}')
    # The accumulator is `train_sum`; the misspelt `trian_sum` raised NameError.
    return train_sum / k, valid_sum / k

# Cross-validation settings: fold count, epochs, Adam learning rate,
# weight decay and mini-batch size.
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 0.1, 0.1, 128
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
# The f-string must read `train_l`; the misspelt `trian_l` raised NameError.
print(f'{k}折验证, 平均训练log rmse:{float(train_l):f},' f'平均验证log rmse{float(valid_l):f}')


def train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size):
    """Train on the full training set, predict the test set, write the CSV.

    Args:
        train_features, train_labels: Training tensors.
        test_features: Test feature tensor.
        test_data: DataFrame with an ``'Id'`` column; a ``'Sold Price'``
            column holding the predictions is added to it in place.
        num_epochs, lr, weight_decay, batch_size: Forwarded to ``train()``.

    Side effects:
        Prints the final training log-RMSE and writes ``submission.csv``
        (columns ``Id`` and ``Sold Price``) to the working directory.
    """
    net = get_net()
    # No held-out set here: every training row is used for fitting.
    train_ls, _ = train(net, train_features, train_labels, None, None, num_epochs, lr, weight_decay, batch_size)
    print(f'trian log rmse {float(train_ls[-1]):f}')

    # Apply the trained network to the test set. Gradients are not needed for
    # inference, so the forward pass runs without autograd tracking.
    with torch.no_grad():
        preds = net(test_features.to(device)).cpu().numpy()

    # Assign the raw array so predictions are matched to rows by position;
    # wrapping it in a Series would align on index labels instead.
    test_data['Sold Price'] = preds[:, 0]
    submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)
    submission.to_csv('submission.csv', index=False)
# Fit on all training rows and write the submission file for the test set.
train_and_pred(
    train_features=train_features,
    test_features=test_features,
    train_labels=train_labels,
    test_data=test_df,
    num_epochs=num_epochs,
    lr=lr,
    weight_decay=weight_decay,
    batch_size=batch_size,
)