import pandas as pd
import statsmodels.api as sm
import os
from sklearn import model_selection
Profit = pd.read_csv(r'D:\Desktop\one\forestfires.csv')
pd.set_option('display.max_rows', None)
Profit.set_index(["num"], inplace=True)
# 清除序列号
Profit = Profit[Profit.Area > 0]
s = str(Profit)
lst = s.split('\n')
# 将数据集转换为字符串,并以换行符分割。
d = {}
for i in lst:
if i in d:
d[i] = d.get(i, 0) + 1
else:
d[i] = 1
# 将字符串转换为字典,得到重复出现项的次数。
lst1 = list(d.keys())
lst2 = list(d.values())
# 字典变成字符串。
lst3 = list()
lst4 = list()
for i in lst1:
lst3 = i.split(' ')
lst4.append(lst3)
del lst4[0]
del lst4[0]
del lst2[0]
del lst2[0]
# 清除转换过程产生的无用项
data1 = pd.DataFrame(lst4)
data2 = pd.DataFrame(lst2)
data3 = pd.concat([data1, data2], axis=1)
data3.columns = list('abcdefgh')
data3 = data3.drop(["a", "b","g"], axis=1)
# 清除无用项
train, test = model_selection.train_test_split(data3, test_size=0.2, random_state=4)
# 根据train数据集建模
model = sm.formula.ols('h~c+d+e+f', data=train).fit()
print('模型的偏回归系数分别为:\n', model.params)
# 删除test数据集中的Profit变量,用剩下的自变量进行预测
test_X = test.drop(labels='h', axis=1)
pred = model.predict(exog=test_X)
print('对比预测值和实际值的差异:\n', pd.DataFrame({'Prediction': pred, 'Real': test.h}))
报错:
Traceback (most recent call last):
File "D:\Desktop\one\venv\lib\site-packages\patsy\categorical.py", line 346, in categorical_to_int
out[i] = level_to_int[value]
KeyError: '20.4'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Desktop\one\venv\lib\site-packages\statsmodels\base\model.py", line 1077, in predict
exog = dmatrix(design_info, exog, return_type="dataframe")
File "D:\Desktop\one\venv\lib\site-packages\patsy\highlevel.py", line 290, in dmatrix
(lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
File "D:\Desktop\one\venv\lib\site-packages\patsy\highlevel.py", line 167, in _do_highlevel_design
return build_design_matrices(design_infos, data,
File "D:\Desktop\one\venv\lib\site-packages\patsy\build.py", line 888, in build_design_matrices
value, is_NA = _eval_factor(factor_info, data, NA_action)
File "D:\Desktop\one\venv\lib\site-packages\patsy\build.py", line 84, in _eval_factor
result = categorical_to_int(result, factor_info.categories, NA_action,
File "D:\Desktop\one\venv\lib\site-packages\patsy\categorical.py", line 359, in categorical_to_int
raise PatsyError("Error converting data to categorical: "
patsy.PatsyError: Error converting data to categorical: observation with value '20.4' does not match any of the expected levels (expected: [' 0.1', ' 0.2', ..., '30.7', '32.5'])
h~c+d+e+f
^
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:/Desktop/one/thenext.py", line 46, in <module>
pred = model.predict(exog=test_X)
File "D:\Desktop\one\venv\lib\site-packages\statsmodels\base\model.py", line 1084, in predict
raise exc.__class__(msg)
patsy.PatsyError: predict requires that you use a DataFrame when predicting from a model
that was created using the formula api.
The original error message returned by patsy is:
Error converting data to categorical: observation with value '20.4' does not match any of the expected levels (expected: [' 0.1', ' 0.2', ..., '30.7', '32.5'])
h~c+d+e+f
^
部分结果:
模型的偏回归系数分别为:
Intercept 5.736380
c[T. 0.2] -0.684095
c[T. 3.1] -0.934095
c[T. 4.0] -0.434095
c[T. 4.7] -0.934095
c[T. 5.7] -0.434095
c[T. 7.7] -0.434095
c[T. 8.8] -0.578793
c[T. 9.2] -0.434095
c[T.11.1] -0.434095
c[T.12.2] -0.289397
c[T.12.4] -0.289397
c[T.12.7] -0.289397
c[T.13.2] 0.099492
c[T.14.4] -0.233841
c[T.15.1] 0.099492
c[T.15.3] -0.113289
c[T.15.4] 0.065905
c[T.15.6] 0.119391
c[T.15.7] -0.113289