import numpy as np
import pandas as pd
# Column labels applied to the listings CSV, in file order.
names = [
    'city', 'addr', 'rule', 'price',
    'size', 'additional', 'zdate', 'unit',
]
# Load the listings, using the labels above as the column names.
df = pd.read_csv('/data/nanjing.csv', names=names)
# Preview the first rows.
df.head()
city addr rule price size additional zdate unit
0 南京 鼓楼-青云巷 3室1厅1卫 900 12㎡ 电视 冰箱 洗衣机 空调 热水器 床 暖气 宽带 衣柜 天然气 2020-03-08 元/月(季付价)
1 南京 鼓楼-南东瓜市 2室0厅1卫 1600 20㎡ NaN 2020-03-05 元/月
2 南京 鼓楼-五塘和园 4室1厅1卫 1390 12㎡ 冰箱 洗衣机 空调 热水器 床 宽带 衣柜 2020-03-05 元/月(月付价)
3 南京 鼓楼-凤凰二村 1室1厅1卫 2000 27㎡ NaN 2020-03-06 元/月
4 南京 鼓楼-中海桃源里 4室1厅1卫 1090 10㎡ 冰箱 洗衣机 空调 热水器 床 宽带 衣柜 2020-03-08 元/月(月付价)
# Forward-fill missing cells from the previous row. DataFrame.ffill() is the
# supported spelling; fillna(method='ffill') is deprecated in pandas 2.x.
df = df.ffill()
# Display any rows that still contain a missing value (forward fill cannot
# fill a gap that has no earlier row to copy from).
df[df.isnull().any(axis=1)]
city addr rule price size additional zdate unit
# Parse the layout string (e.g. '3室1厅1卫') into room and hall counts.
# Raw-string patterns avoid the invalid '\d' escape warning, and anchoring on
# the 室/厅 suffix with \d+ keeps multi-digit counts intact instead of taking
# whichever single digit happens to sit at a given position.
# expand=False returns a Series rather than a one-column DataFrame.
room = df['rule'].str.extract(r'(\d+)室', expand=False)
hall = df['rule'].str.extract(r'(\d+)厅', expand=False)
# A layout lacking either part yields NaN, which makes the integer cast raise
# instead of silently borrowing a digit from another field.
room = room.astype('int64')
hall = hall.astype('int64')
# Collapse the layout into a single numeric feature: rooms / 2 + halls.
df['rule'] = (room + (hall * 2)) / 2
# Keep only the first run of digits of the area text (e.g. '12㎡' -> 12).
df['size'] = df['size'].str.extract(r'(\d+)', expand=False)
df['size'] = df['size'].astype('int64')
def additionals(a):
    """Return the number of amenities listed in a whitespace-separated string.

    Args:
        a: Amenity text such as ``'电视 冰箱 洗衣机'``. May also be a missing
            value (``None`` or float ``NaN``) instead of a string.

    Returns:
        The count of non-empty tokens in ``a``; 0 when ``a`` is not a string
        or contains no tokens.
    """
    # A missing cell is not a string and has no .split(); count it as zero
    # amenities instead of raising AttributeError.
    if not isinstance(a, str):
        return 0
    # split() with no argument discards empty tokens, so an empty string or
    # repeated/leading/trailing spaces do not inflate the count.
    return len(a.split())
# Replace the amenity text with the number of amenities listed.
df['additional'] = df['additional'].apply(additionals)
# Per-date mean of the numeric features. numeric_only=True is needed on
# pandas >= 2.0, where mean() raises on the text columns (city, addr, unit)
# instead of silently dropping them as older versions did.
data = df.groupby('zdate').mean(numeric_only=True)
# Show the last five dates.
data.tail()
rule price size additional
zdate
2020-03-05 2.408271 1937.629323 49.803008 7.490977
2020-03-06 2.412957 1918.367250 52.073132 7.460254
2020-03-07 2.332008 2004.573890 50.938370 7.460570
2020-03-08 2.511217 1770.254929 35.565942 7.208702
2020-03-09 2.722754 1530.810778 30.033533 7.217964
# Number of listings per date (row count of each zdate group).
counts=df.groupby('zdate').size()
# Show the last five dates.
counts.tail()
zdate
2020-03-05    1330
2020-03-06    1258
2020-03-07    1509
2020-03-08    2942
2020-03-09     835
dtype: int64



from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
# Features: per-date feature means. Target: per-date listing counts.
X = data
y = counts
# Hold out 20% of the dates for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=125,
)
# Build the linear regression model and fit it on the training split.
clf = LinearRegression()
clf.fit(X_train, y_train)
print('建立的LinearRegression模型为:','\n',clf)
建立的LinearRegression模型为: 
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)


# Predict the target for the held-out rows.
y_pred = clf.predict(X_test)
# Number of predictions produced (one per test row).
print(len(y_pred))
# Show at most the first 20 predictions.
print('预测前20个结果为:','\n',y_pred[:20])
26
预测前20个结果为: 
 [ 64.04629768  72.32580809 266.94981422 254.43703779 134.24151951
   0.80319776 122.11897184  70.2876959    5.53085201 232.45575409
 185.17606994 225.12269869  87.68120282 221.29098145 150.57651981
 -27.12364545 522.41034522 203.97223093 203.90648176  66.07014051]

Last modified: May 5th, 2020 at 07:05 pm
如果觉得我的文章对你有用,请随意赞赏