from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
import time
mpl.rcParams['font.sans-serif']=[u'simHei']
mpl.rcParams['axes.unicode_minus']=False
path1='datas/household_power_consumption_1000.txt'
df = pd.read_csv(path1, sep=';', low_memory=False)
df.head()
| Date | Time | Global_active_power | Global_reactive_power | Voltage | Global_intensity | Sub_metering_1 | Sub_metering_2 | Sub_metering_3 |
---|
0 | 16/12/2006 | 17:24:00 | 4.216 | 0.418 | 234.84 | 18.4 | 0.0 | 1.0 | 17.0 |
---|
1 | 16/12/2006 | 17:25:00 | 5.360 | 0.436 | 233.63 | 23.0 | 0.0 | 1.0 | 16.0 |
---|
2 | 16/12/2006 | 17:26:00 | 5.374 | 0.498 | 233.29 | 23.0 | 0.0 | 2.0 | 17.0 |
---|
3 | 16/12/2006 | 17:27:00 | 5.388 | 0.502 | 233.74 | 23.0 | 0.0 | 1.0 | 17.0 |
---|
4 | 16/12/2006 | 17:28:00 | 3.666 | 0.528 | 235.68 | 15.8 | 0.0 | 1.0 | 17.0 |
---|
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
Date 1000 non-null object
Time 1000 non-null object
Global_active_power 1000 non-null float64
Global_reactive_power 1000 non-null float64
Voltage 1000 non-null float64
Global_intensity 1000 non-null float64
Sub_metering_1 1000 non-null float64
Sub_metering_2 1000 non-null float64
Sub_metering_3 1000 non-null float64
dtypes: float64(7), object(2)
memory usage: 70.4+ KB
new_df = df.replace('?', np.nan)
datas = new_df.dropna(axis=0, how = 'any')
datas.describe().T
| count | mean | std | min | 25% | 50% | 75% | max |
---|
Global_active_power | 1000.0 | 2.418772 | 1.239979 | 0.206 | 1.806 | 2.414 | 3.308 | 7.706 |
---|
Global_reactive_power | 1000.0 | 0.089232 | 0.088088 | 0.000 | 0.000 | 0.072 | 0.126 | 0.528 |
---|
Voltage | 1000.0 | 240.035790 | 4.084420 | 230.980 | 236.940 | 240.650 | 243.295 | 249.370 |
---|
Global_intensity | 1000.0 | 10.351000 | 5.122214 | 0.800 | 8.400 | 10.000 | 14.000 | 33.200 |
---|
Sub_metering_1 | 1000.0 | 0.000000 | 0.000000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
---|
Sub_metering_2 | 1000.0 | 2.749000 | 8.104053 | 0.000 | 0.000 | 0.000 | 1.000 | 38.000 |
---|
Sub_metering_3 | 1000.0 | 5.756000 | 8.066941 | 0.000 | 0.000 | 0.000 | 17.000 | 19.000 |
---|
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
Date 1000 non-null object
Time 1000 non-null object
Global_active_power 1000 non-null float64
Global_reactive_power 1000 non-null float64
Voltage 1000 non-null float64
Global_intensity 1000 non-null float64
Sub_metering_1 1000 non-null float64
Sub_metering_2 1000 non-null float64
Sub_metering_3 1000 non-null float64
dtypes: float64(7), object(2)
memory usage: 70.4+ KB
def date_format(dt):
import time
t = time.strptime(' '.join(dt), '%d/%m/%Y %H:%M:%S')
return (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec)
X = datas.iloc[:,0:2]
X = X.apply(lambda x: pd.Series(date_format(x)), axis=1)
Y = datas['Global_active_power']
X.head(2)
| 0 | 1 | 2 | 3 | 4 | 5 |
---|
0 | 2006 | 12 | 16 | 17 | 24 | 0 |
---|
1 | 2006 | 12 | 16 | 17 | 25 | 0 |
---|
X_train,X_test,Y_train,Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
(800, 6)
(200, 6)
(800,)
X_train.describe()
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
pd.DataFrame(X_train).describe()
| 0 | 1 | 2 | 3 | 4 | 5 |
---|
count | 800.0 | 800.0 | 8.000000e+02 | 8.000000e+02 | 8.000000e+02 | 800.0 |
---|
mean | 0.0 | 0.0 | 2.445821e-15 | -8.604228e-17 | 8.104628e-17 | 0.0 |
---|
std | 0.0 | 0.0 | 1.000626e+00 | 1.000626e+00 | 1.000626e+00 | 0.0 |
---|
min | 0.0 | 0.0 | -1.221561e+00 | -1.333814e+00 | -1.722545e+00 | 0.0 |
---|
25% | 0.0 | 0.0 | -1.221561e+00 | -8.377420e-01 | -8.532677e-01 | 0.0 |
---|
50% | 0.0 | 0.0 | 8.186245e-01 | -3.416698e-01 | 1.600918e-02 | 0.0 |
---|
75% | 0.0 | 0.0 | 8.186245e-01 | 1.022529e+00 | 8.852861e-01 | 0.0 |
---|
max | 0.0 | 0.0 | 8.186245e-01 | 1.518601e+00 | 1.696611e+00 | 0.0 |
---|
lr = LinearRegression(fit_intercept=True)
lr.fit(X_train, Y_train)
y_predict = lr.predict(X_test)
print("训练集上R2:",lr.score(X_train, Y_train))
print("测试集上R2:",lr.score(X_test, Y_test))
mse = np.average((y_predict-Y_test)**2)
rmse = np.sqrt(mse)
print("rmse:",rmse)
训练集上R2: 0.24409311805909026
测试集上R2: 0.12551628513735869
rmse: 1.1640923459736248
print("模型的系数(θ):", end="")
print(lr.coef_)
print("模型的截距:", end='')
print(lr.intercept_)
模型的系数(θ):[ 0.00000000e+00 -6.66133815e-16 -1.41588166e+00 -9.34953243e-01
-1.02140756e-01 0.00000000e+00]
模型的截距:2.4454375000000033
from sklearn.externals import joblib
joblib.dump(ss, "result/data_ss.model")
joblib.dump(lr, "result/data_lr.model")
ss3 = joblib.load("result/data_ss.model")
lr3 = joblib.load("result/data_lr.model")
data1 = [[2006, 12, 17, 12, 25, 0]]
data1 = ss3.transform(data1)
print(data1)
lr3.predict(data1)
[[ 0. 0. 0.81862454 0.15440249 -0.27374978 0. ]]
array([1.16996393])
t=np.arange(len(X_test))
plt.figure(facecolor='w')
plt.plot(t, Y_test, 'r-', linewidth=2, label='真实值')
plt.plot(t, y_predict, 'g-', linewidth=2, label='预测值')
plt.legend(loc = 'upper left')
plt.title("线性回归预测时间和功率之间的关系", fontsize=20)
plt.grid(b=True)
plt.show()
X = datas.iloc[:,2:4]
Y2 = datas.iloc[:,5]
X2_train,X2_test,Y2_train,Y2_test = train_test_split(X, Y2, test_size=0.2, random_state=0)
scaler2 = StandardScaler()
X2_train = scaler2.fit_transform(X2_train)
X2_test = scaler2.transform(X2_test)
lr2 = LinearRegression()
lr2.fit(X2_train, Y2_train)
Y2_predict = lr2.predict(X2_test)
print("电流预测准确率: ", lr2.score(X2_test,Y2_test))
print("电流参数:", lr2.coef_)
t=np.arange(len(X2_test))
plt.figure(facecolor='w')
plt.plot(t, Y2_test, 'r-', linewidth=2, label=u'真实值')
plt.plot(t, Y2_predict, 'g-', linewidth=2, label=u'预测值')
plt.legend(loc = 'lower right')
plt.title(u"线性回归预测功率与电流之间的关系", fontsize=20)
plt.grid(b=True)
plt.show()
电流预测准确率: 0.9920420609708968
电流参数: [5.07744316 0.07191391]