DengQN·一个普通程序员;
【调包侠的机器学习】医疗保险线性回归
2022-08-22 09:19 65
#线性回归#ml#tf
# https://www.kaggle.com/datasets/mirichoi0218/insurance

import numpy as np 
import tensorflow as tf
import pandas as pd
def mean_norm(df_input): #@save
    """Standardize every column of ``df_input`` (z-score normalization).

    Each column is centered on its own mean and divided by its own
    ``std()``.

    A constant column has a standard deviation of 0; dividing by it would
    turn the whole column into NaN (0/0), so such a column is returned as
    its centered values (all zeros) instead.

    Args:
        df_input: DataFrame whose columns are all numeric.

    Returns:
        The result of applying the per-column standardization to
        ``df_input`` column by column.
    """
    def _standardize(column):
        centered = column - column.mean()
        spread = column.std()
        # Guard against 0/0 -> NaN for columns that hold a single value.
        if spread == 0:
            return centered
        return centered / spread

    return df_input.apply(_standardize, axis=0)
def de_mean_norm(result, df_input:pd.DataFrame): #@save
    """Map standardized values back to the original scale.

    Computes ``result * df_input.std() + df_input.mean()``, i.e. the
    inverse of a z-score standardization that used the statistics of
    ``df_input``.

    Args:
        result: Standardized value(s) to convert back.
        df_input: The original (un-normalized) data; only its ``std()``
            and ``mean()`` are used.

    Returns:
        ``result`` rescaled by the spread and shifted by the mean of
        ``df_input``.
    """
    scale = df_input.std()
    offset = df_input.mean()
    rescaled = result * scale
    return rescaled + offset
# Load the Kaggle medical-insurance dataset from the local data directory
# and preview its first ten rows.
insurenceData = pd.read_csv(filepath_or_buffer="../data/insurance.csv")
insurenceData.head(n=10)

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>age</th> <th>sex</th> <th>bmi</th> <th>children</th> <th>smoker</th> <th>region</th> <th>charges</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>19</td> <td>female</td> <td>27.900</td> <td>0</td> <td>yes</td> <td>southwest</td> <td>16884.92400</td> </tr> <tr> <th>1</th> <td>18</td> <td>male</td> <td>33.770</td> <td>1</td> <td>no</td> <td>southeast</td> <td>1725.55230</td> </tr> <tr> <th>2</th> <td>28</td> <td>male</td> <td>33.000</td> <td>3</td> <td>no</td> <td>southeast</td> <td>4449.46200</td> </tr> <tr> <th>3</th> <td>33</td> <td>male</td> <td>22.705</td> <td>0</td> <td>no</td> <td>northwest</td> <td>21984.47061</td> </tr> <tr> <th>4</th> <td>32</td> <td>male</td> <td>28.880</td> <td>0</td> <td>no</td> <td>northwest</td> <td>3866.85520</td> </tr> <tr> <th>5</th> <td>31</td> <td>female</td> <td>25.740</td> <td>0</td> <td>no</td> <td>southeast</td> <td>3756.62160</td> </tr> <tr> <th>6</th> <td>46</td> <td>female</td> <td>33.440</td> <td>1</td> <td>no</td> <td>southeast</td> <td>8240.58960</td> </tr> <tr> <th>7</th> <td>37</td> <td>female</td> <td>27.740</td> <td>3</td> <td>no</td> <td>northwest</td> <td>7281.50560</td> </tr> <tr> <th>8</th> <td>37</td> <td>male</td> <td>29.830</td> <td>2</td> <td>no</td> <td>northeast</td> <td>6406.41070</td> </tr> <tr> <th>9</th> <td>60</td> <td>female</td> <td>25.840</td> <td>0</td> <td>no</td> <td>northwest</td> <td>28923.13692</td> </tr> </tbody> </table> </div>

# Encode the `sex` column as integers (female -> 0, male -> 1) so that it
# can be used as a numeric feature, then preview the result.
sex_mapping = dict(female=0, male=1)
encoded_sex = insurenceData['sex'].map(sex_mapping)
insurenceData['sex'] = encoded_sex
insurenceData.head(n=10)

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>age</th> <th>sex</th> <th>bmi</th> <th>children</th> <th>smoker</th> <th>region</th> <th>charges</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>19</td> <td>0</td> <td>27.900</td> <td>0</td> <td>yes</td> <td>southwest</td> <td>16884.92400</td> </tr> <tr> <th>1</th> <td>18</td> <td>1</td> <td>33.770</td> <td>1</td> <td>no</td> <td>southeast</td> <td>1725.55230</td> </tr> <tr> <th>2</th> <td>28</td> <td>1</td> <td>33.000</td> <td>3</td> <td>no</td> <td>southeast</td> <td>4449.46200</td> </tr> <tr> <th>3</th> <td>33</td> <td>1</td> <td>22.705</td> <td>0</td> <td>no</td> <td>northwest</td> <td>21984.47061</td> </tr> <tr> <th>4</th> <td>32</td> <td>1</td> <td>28.880</td> <td>0</td> <td>no</td> <td>northwest</td> <td>3866.85520</td> </tr> <tr> <th>5</th> <td>31</td> <td>0</td> <td>25.740</td> <td>0</td> <td>no</td> <td>southeast</td> <td>3756.62160</td> </tr> <tr> <th>6</th> <td>46</td> <td>0</td> <td>33.440</td> <td>1</td> <td>no</td> <td>southeast</td> <td>8240.58960</td> </tr> <tr> <th>7</th> <td>37</td> <td>0</td> <td>27.740</td> <td>3</td> <td>no</td> <td>northwest</td> <td>7281.50560</td> </tr> <tr> <th>8</th> <td>37</td> <td>1</td> <td>29.830</td> <td>2</td> <td>no</td> <td>northeast</td> <td>6406.41070</td> </tr> <tr> <th>9</th> <td>60</td> <td>0</td> <td>25.840</td> <td>0</td> <td>no</td> <td>northwest</td> <td>28923.13692</td> </tr> </tbody> </table> </div>

# Encode the `region` column as integer codes 0..3.
# (Category frequencies can be inspected with
#  insurenceData['region'].value_counts().)
# NOTE(review): the regions have no natural order, so these integer codes
# impose an arbitrary ranking on a linear model; consider one-hot encoding.
region_mapping = {
    name: code
    for code, name in enumerate(
        ('southeast', 'southwest', "northwest", "northeast")
    )
}
insurenceData['region'] = insurenceData['region'].map(region_mapping)
insurenceData.head(n=10)

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>age</th> <th>sex</th> <th>bmi</th> <th>children</th> <th>smoker</th> <th>region</th> <th>charges</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>19</td> <td>0</td> <td>27.900</td> <td>0</td> <td>yes</td> <td>1</td> <td>16884.92400</td> </tr> <tr> <th>1</th> <td>18</td> <td>1</td> <td>33.770</td> <td>1</td> <td>no</td> <td>0</td> <td>1725.55230</td> </tr> <tr> <th>2</th> <td>28</td> <td>1</td> <td>33.000</td> <td>3</td> <td>no</td> <td>0</td> <td>4449.46200</td> </tr> <tr> <th>3</th> <td>33</td> <td>1</td> <td>22.705</td> <td>0</td> <td>no</td> <td>2</td> <td>21984.47061</td> </tr> <tr> <th>4</th> <td>32</td> <td>1</td> <td>28.880</td> <td>0</td> <td>no</td> <td>2</td> <td>3866.85520</td> </tr> <tr> <th>5</th> <td>31</td> <td>0</td> <td>25.740</td> <td>0</td> <td>no</td> <td>0</td> <td>3756.62160</td> </tr> <tr> <th>6</th> <td>46</td> <td>0</td> <td>33.440</td> <td>1</td> <td>no</td> <td>0</td> <td>8240.58960</td> </tr> <tr> <th>7</th> <td>37</td> <td>0</td> <td>27.740</td> <td>3</td> <td>no</td> <td>2</td> <td>7281.50560</td> </tr> <tr> <th>8</th> <td>37</td> <td>1</td> <td>29.830</td> <td>2</td> <td>no</td> <td>3</td> <td>6406.41070</td> </tr> <tr> <th>9</th> <td>60</td> <td>0</td> <td>25.840</td> <td>0</td> <td>no</td> <td>2</td> <td>28923.13692</td> </tr> </tbody> </table> </div>

# Encode the `smoker` column as integers (yes -> 1, no -> 0).
# (Category frequencies can be inspected with
#  insurenceData['smoker'].value_counts().)
smoker_mapping = dict(yes=1, no=0)
insurenceData['smoker'] = insurenceData['smoker'].map(smoker_mapping)
insurenceData.head(n=100)

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>age</th> <th>sex</th> <th>bmi</th> <th>children</th> <th>smoker</th> <th>region</th> <th>charges</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>19</td> <td>0</td> <td>27.900</td> <td>0</td> <td>1</td> <td>1</td> <td>16884.92400</td> </tr> <tr> <th>1</th> <td>18</td> <td>1</td> <td>33.770</td> <td>1</td> <td>0</td> <td>0</td> <td>1725.55230</td> </tr> <tr> <th>2</th> <td>28</td> <td>1</td> <td>33.000</td> <td>3</td> <td>0</td> <td>0</td> <td>4449.46200</td> </tr> <tr> <th>3</th> <td>33</td> <td>1</td> <td>22.705</td> <td>0</td> <td>0</td> <td>2</td> <td>21984.47061</td> </tr> <tr> <th>4</th> <td>32</td> <td>1</td> <td>28.880</td> <td>0</td> <td>0</td> <td>2</td> <td>3866.85520</td> </tr> <tr> <th>...</th> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> </tr> <tr> <th>95</th> <td>28</td> <td>0</td> <td>37.620</td> <td>1</td> <td>0</td> <td>0</td> <td>3766.88380</td> </tr> <tr> <th>96</th> <td>54</td> <td>0</td> <td>30.800</td> <td>3</td> <td>0</td> <td>1</td> <td>12105.32000</td> </tr> <tr> <th>97</th> <td>55</td> <td>1</td> <td>38.280</td> <td>0</td> <td>0</td> <td>0</td> <td>10226.28420</td> </tr> <tr> <th>98</th> <td>56</td> <td>1</td> <td>19.950</td> <td>0</td> <td>1</td> <td>3</td> <td>22412.64850</td> </tr> <tr> <th>99</th> <td>38</td> <td>1</td> <td>19.300</td> <td>0</td> <td>1</td> <td>1</td> <td>15820.69900</td> </tr> </tbody> </table> <p>100 rows × 7 columns</p> </div>

# Standardize every column (features and target alike), then split the
# frame into the target `charges` and the remaining feature columns.
rData = mean_norm(insurenceData)
Y = rData['charges']
x = rData.drop('charges', axis=1)

# Debug output of the target before and after the feature tensor is built.
print('1charges: ', Y)
X = tf.convert_to_tensor(x)
print('charges: ', Y)

# Target as a tensor; print both tensors for inspection.
y = tf.convert_to_tensor(Y)
print(X, y)
1charges:  0       0.298472
1      -0.953333
2      -0.728402
3       0.719574
4      -0.776512
          ...   
1333   -0.220468
1334   -0.913661
1335   -0.961237
1336   -0.930014
1337    1.310563
Name: charges, Length: 1338, dtype: float64
charges:  0       0.298472
1      -0.953333
2      -0.728402
3       0.719574
4      -0.776512
          ...   
1333   -0.220468
1334   -0.913661
1335   -0.961237
1336   -0.930014
1337    1.310563
Name: charges, Length: 1338, dtype: float64
tf.Tensor(
[[-1.4382265  -1.010141   -0.45315057 -0.90827406  1.9698501  -0.40272369]
 [-1.50940108  0.98922092  0.50943062 -0.07873775 -0.50727343 -1.2875255 ]
 [-0.7976553   0.98922092  0.38316358  1.58033487 -0.50727343 -1.2875255 ]
 ...
 [-1.50940108 -1.010141    1.01449877 -0.90827406 -0.50727343 -1.2875255 ]
 [-1.29587735 -1.010141   -0.79751522 -0.90827406 -0.50727343 -0.40272369]
 [ 1.55110577 -1.010141   -0.26129026 -0.90827406  1.9698501   0.48207812]], shape=(1338, 6), dtype=float64) tf.Tensor(
[ 0.2984722  -0.95333272 -0.72840232 ... -0.96123683 -0.93001377
  1.31056344], shape=(1338,), dtype=float64)
# Build the regression network: two stacked Dense(1) layers without an
# activation function, taking the 6 feature columns as input.
net = tf.keras.Sequential()
net.add(tf.keras.layers.Dense(units=1, input_dim=6))
initializer = tf.initializers.RandomNormal(stddev=0.1)
net.add(tf.keras.layers.Dense(1, kernel_initializer=initializer))

opt = tf.keras.optimizers.Adam(learning_rate=0.001)
# This is a regression task, so track a regression metric. The previous
# SparseCategoricalAccuracy is a classification metric and only ever
# reported 0 here.
net.compile(
    optimizer=opt,
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanAbsoluteError()],
)

# Hold out the last 20% of the samples for validation.
history = net.fit(
    X,
    y,
    batch_size=1000,
    epochs=500,
    validation_split=0.2,
    shuffle=True,
)
net.summary()
Epoch 1/500
2/2 [==============================] - 0s 78ms/step - loss: 0.2575 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2666 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 2/500
2/2 [==============================] - 0s 56ms/step - loss: 0.2573 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2663 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 3/500
2/2 [==============================] - 0s 47ms/step - loss: 0.2570 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2661 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 4/500
2/2 [==============================] - 0s 56ms/step - loss: 0.2568 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2659 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 5/500
2/2 [==============================] - 0s 30ms/step - loss: 0.2566 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2656 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 6/500
2/2 [==============================] - 0s 53ms/step - loss: 0.2565 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2653 - val_sparse_categorical_accuracy: 0.0000e+00

……(中间各轮的训练日志已省略)……

2/2 [==============================] - 0s 53ms/step - loss: 0.2484 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2521 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 499/500
2/2 [==============================] - 0s 25ms/step - loss: 0.2484 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2520 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 500/500
2/2 [==============================] - 0s 54ms/step - loss: 0.2484 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2520 - val_sparse_categorical_accuracy: 0.0000e+00
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 1)                 7         
                                                                 
 dense_1 (Dense)             (None, 1)                 2         
                                                                 
=================================================================
Total params: 9
Trainable params: 9
Non-trainable params: 0
_________________________________________________________________
import matplotlib.pyplot as plt

# Show which series Keras recorded during training.
print(history.history.keys())

# Draw the training and validation loss curves on the same axes.
for key, label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(history.history[key], label=label)
plt.legend()
plt.show()
dict_keys(['loss', 'sparse_categorical_accuracy', 'val_loss', 'val_sparse_categorical_accuracy'])

png

# The network was trained on z-scored features, so a raw sample has to be
# standardized with the same per-column mean/std before it is passed to
# predict(); feeding the raw values produces a meaningless output.
feature_frame = insurenceData.drop(columns='charges')
raw_sample = pd.DataFrame(
    [[19, 0, 27.900, 0, 1, 1]],
    columns=feature_frame.columns,
)
norm_sample = (raw_sample - feature_frame.mean()) / feature_frame.std()

# The prediction is still in standardized units of `charges`.
pY = net.predict(norm_sample.to_numpy())
pY
array([[11.344333]], dtype=float32)
# Keep the un-normalized target column; its mean/std are needed to map
# predictions back to the original scale. Show its summary statistics.
oriY = insurenceData.loc[:, 'charges']
oriY.describe()
count     1338.000000
mean     13270.422265
std      12110.011237
min       1121.873900
25%       4740.287150
50%       9382.033000
75%      16639.912515
max      63770.428010
Name: charges, dtype: float64
# De-normalize: convert the standardized prediction back to the original
# scale of `charges` using the mean/std of the raw target column.
de_mean_norm(result=pY, df_input=oriY)
array([[150650.42]], dtype=float32)