# 题目

## 江西省数学建模-某肿瘤疾病诊疗的经济学分析

1、建立根据不同疾病的分类模型。建立诊疗费用与疾病类型的数学关系，并进行预测和检验。
2、建立数学模型分析诊疗费用与各类疾病的亚群的特征，比如，高费用人群的年龄，性别，住院日期和相关数据的相关性，尝试对特定的亚群建立预测模型并进行验证。
3、如果该疾病纳入医保，尝试给出根据疾病类型、建议年龄段和国家承担的经济费用的方案并对相关方案合理性和经济性作出评估。

# 2 数据集主要特征分析

``````import numpy as np
import pandas as pd
train_data_file = './cdata.csv'
if __name__ =="__main__":
print()
t_data.describe()
``````

（1）数据长度：17739
（2）主要诊断类别:183种

``````def maindiag_extract(data):
text_len =[]
datalen = len(data)
for i in range(0,datalen):
one_lines = ''.join(list(data['maindiag'][i]))
text_id = one_lines.strip().split("|")
text_len.append(text_id[0])
all_category = list(set(text_len))
print(all_category)
print(len(all_category))
print()
```
（3）次要诊断类别:803种
```python
def elsediag_extract(data):
    """Print the distinct secondary-diagnosis codes in ``data`` and their count.

    An ``elsediag`` cell is split on ``,`` into entries; each entry is split
    on ``|`` and the part before the first ``|`` is taken as the code.
    Missing cells are skipped.

    Args:
        data: DataFrame with an ``elsediag`` column.
    """
    codes = set()
    # Iterate over the values instead of ``data['elsediag'][i]`` so the
    # function also works on frames whose index is not 0..n-1.
    for raw in data['elsediag']:
        if pd.isnull(raw):
            continue
        for entry in str(raw).strip().split(","):
            code = entry.strip().split("|")[0]
            if code:  # a trailing or doubled comma yields an empty entry
                codes.add(code)
    # Sorted so that repeated runs print the categories in the same order.
    all_category = sorted(codes)
    print(all_category)
    print(len(all_category))
``````

（4）DRGs类别数:72类

``````def drgs_extract(data):
text_len =[]
datalen = len(data)
for i in range(0,datalen):
text_id = data['drgsid'][i]
text_len.append(text_id)
all_category = list(set(text_len))
print(all_category)
print(len(all_category))
print()
``````

（5）DRGS分组平均费用分布分析

```python
import numpy as np
import pandas as pd
# import tensorflow as tf
from category_encoders.target_encoder import TargetEncoder
import matplotlib.pyplot as plt
import statsmodels.api as sm
def fee_range(data):
    """Scatter-plot the mean fee of every DRG group, highest mean first.

    The ``{drgsid: mean fee}`` mapping is printed in the plotted order after
    the figure is shown.

    Args:
        data: DataFrame with ``drgsid`` and ``fee`` columns.
    """
    # Group first, then average: fees must not be pooled across groups,
    # otherwise every group's "mean" depends on the rows that precede it.
    mean_fee = data.groupby('drgsid')['fee'].mean()
    a_cate = mean_fee.sort_values(ascending=False).to_dict()
    x = list(a_cate.keys())
    y = list(a_cate.values())
    plt.scatter(x, y, alpha=0.9)  # slightly transparent markers
    plt.show()
    print(a_cate)
    print()
if __name__ == "__main__":
    # NOTE(review): t_data is the data-set DataFrame, presumably read from
    # train_data_file; make sure it is loaded before this call.
    fee_range(t_data)
    print()
``````

（6）DRGS分组类别分布

``````def box_line(data):
text_len =[]
# category =[]
category={}
cate_box = pd.DataFrame()
datalen = len(data)
# feelist =[]
for i in range(0,datalen):
text_id = data['drgsid'][i]
data_fee = data['fee'][i]
if text_id in category.keys():
templist = list(category[text_id])
templist.append(data_fee)
category[text_id] =list(set(templist))
else:
category[text_id] = [data_fee]
pxy = {}
for k in category.keys():
pxy[k] = len(category[k])
# print(k,len(category[k]))
resultxy = dict(sorted(pxy.items(), key=lambda x: x[1]))
x = list(resultxy.keys())
y = list(resultxy.values())
for j in resultxy.keys():
print(j,resultxy[j])
plt.xlabel('DRGs')
plt.title('Distribution of the number of grouping categories ')
plt.ylabel('The amount of DRGS')
plt.xticks([])
# x = [i for i in range(len(y))]
plt.scatter(x, y, alpha=0.9)  # 绘制散点图，透明度为0.6（这样颜色浅一点，比较好看）
plt.show()
print()
``````

（7）DRGS分组中费用范围箱线图

``````def box_line(data):
text_len =[]
# category =[]
category={}
cate_box = pd.DataFrame()
datalen = len(data)
# feelist =[]
for i in range(0,datalen):
text_id = data['drgsid'][i]
data_fee = data['fee'][i]
if text_id in category.keys():
templist = list(category[text_id])
templist.append(data_fee)
category[text_id] =list(set(templist))
else:
category[text_id] = [data_fee]
pxy = {}
for k in category.keys():
pxy[k] = len(category[k])
# print(k,len(category[k]))
sordict = dict(sorted(pxy.items(), key=lambda x: x[1]))
resultxy ={}
for k in sordict.keys():
resultxy[k] = category[k]
for k in resultxy.keys():
templi = list(resultxy[k])
templen = len(templi)
if 4272 > templen:
for i in range(4272-templen):
templi.append(np.nan)
cate_box[k] = templi
cate_box.plot.box(title="Fee-categroy")
plt.grid(linestyle="--", alpha=0.3)
plt.show()

print()
``````

``````def drgs_box_line(data):
category={}
cate_box = pd.DataFrame()
datalen = len(data)
# feelist =[]
for i in range(0,datalen):
data_fee = data['fee'][i]
if text_id in category.keys():
templist = list(category[text_id])
templist.append(data_fee)
category[text_id] =list(set(templist))
else:
category[text_id] = [data_fee]
pxy = {}
for k in category.keys():
pxy[k] = len(category[k])
# print(k,len(sordict[k]))
sordict = dict(sorted(pxy.items(), key=lambda x: x[1]))
resultxy ={}
for k in sordict.keys():
# resultxy[k] = category[k]
print(k,sordict[k])
``````

``````def drgs_box_line(data):
category={}
cate_box = pd.DataFrame()
datalen = len(data)
# feelist =[]
for i in range(0,datalen):
data_fee = data['fee'][i]
if text_id in category.keys():
templist = list(category[text_id])
templist.append(data_fee)
category[text_id] =list(set(templist))
else:
category[text_id] = [data_fee]
pxy = {}
for k in category.keys():
pxy[k] = np.mean(category[k])
# print(k,len(sordict[k]))
sordict = dict(sorted(pxy.items(), key=lambda x: x[1]))
resultxy ={}
for k in sordict.keys():
resultxy[k] = category[k]
# print(k,sordict[k])
for k in resultxy.keys():
templi = list(resultxy[k])
templen = len(templi)
if 4682 > templen:
for i in range(4682-templen):
templi.append(np.nan)
cate_box[k] = templi
cate_box.plot.box(title="Fee-categroy")
plt.grid(linestyle="--", alpha=0.3)
plt.title('Relationship between ADRG and medical fee')
plt.ylabel('medical fee')
plt.show()
``````

（1）数据长度：17739行
（2）主要诊断类别:183种
（3）DRGs类别数:72种
（4）次要诊断类别:803
（5）最后一列属性为费用异常标记，可以看到有高费用异常和低费用异常两种取值，其具体含义暂不明确。

# 3 数据集亚群特征分析

（1）年龄与平均费用关系折线图

``````def age_static(data):
age_fee ={}
datalen = len(data)
for i in range(0,datalen):
born_year = data['born'][i]
if born_year=='0 AM':
continue
else:
intime = ''.join(data['intime'][i])
in_year = intime.strip().split("/")
age = int(in_year[2])-int(born_year)
data_fee = data['fee'][i]
if age in age_fee.keys():
templist = list(age_fee[age])
templist.append(data_fee)
age_fee[age] =list(templist)
else:
age_fee[age] = [data_fee]
# 计算平均费用
avg_age_fee ={}
for k in age_fee.keys():
avg = np.mean(list(age_fee[k]))
avg_age_fee[k] = avg
sort_avg_fee = dict(sorted(avg_age_fee.items(), key=lambda x: x[0]))
print(sort_avg_fee)
x = list(sort_avg_fee.keys())
y = list(sort_avg_fee.values())
plt.plot(x,y,'b--',label='age-fee')
plt.title('Relationship between age and cost')
plt.xlabel('age')
plt.ylabel('medical-fee')
plt.show()
print()
``````

（2）阶段年龄分布柱状图

{30: 25658.83080291971,
 40: 25232.891867549668,
 50: 26072.089125503106,
 60: 27377.498989296368,
 70: 32492.331597490345,
 90: 36317.296185236126}

（键为各年龄段的上限，其中 90 表示 70 岁以上；值为该年龄段的平均费用。）

``````def age_static(data):
age_fee ={}
datalen = len(data)
for i in range(0,datalen):
born_year = data['born'][i]
if born_year=='0 AM':
continue
else:
intime = ''.join(data['intime'][i])
in_year = intime.strip().split("/")
age = int(in_year[2])-int(born_year)
data_fee = data['fee'][i]
if age in age_fee.keys():
templist = list(age_fee[age])
templist.append(data_fee)
age_fee[age] =list(templist)
else:
age_fee[age] = [data_fee]
# 计算平均费用
avg_age_fee ={}
for k in age_fee.keys():
avg = np.mean(list(age_fee[k]))
avg_age_fee[k] = avg
sort_avg_fee = dict(sorted(avg_age_fee.items(), key=lambda x: x[0]))
#绘制直方图，阶段年龄与平均费用的
li30 =[]
li40 =[]
li50 =[]
li60 =[]
li70 =[]
limax =[]
n_age_fee = {}
for k in age_fee.keys():
age = int(k)
if age <=30:
li30.extend(age_fee[k])
elif age <=40:
li40.extend(age_fee[k])
elif age <=50:
li50.extend(age_fee[k])
elif age <=60:
li60.extend(age_fee[k])
elif age <=70:
li70.extend(age_fee[k])
else:
limax.extend(age_fee[k])
n_age_fee[30] = li30
n_age_fee[40] = li40
n_age_fee[50] = li50
n_age_fee[60] = li60
n_age_fee[70] = li70
n_age_fee[90] = limax
# 计算平均费用
level_age_fee ={}
for k in n_age_fee.keys():
avg = np.mean(list(n_age_fee[k]))
level_age_fee[k] = avg
sort_level_age_fee = dict(sorted(level_age_fee.items(), key=lambda x: x[0]))
x = ['<=30','31-40','41-50','51-60','61-70','>=70']
y = list(sort_level_age_fee.values())
plt.title('Relationship between age_range and cost')
plt.xlabel('age_range')
plt.ylabel('medical-fee')
for a,b in zip(x,y):
plt.text(a, b+0.05, '%.0f' % b, ha='center', va= 'bottom',fontsize=11)
plt.bar(x, y)
plt.show()
print()
``````

``````def age_static(data):
age_fee ={}
datalen = len(data)
for i in range(0,datalen):
born_year = data['born'][i]
if born_year=='0 AM':
continue
else:
intime = ''.join(data['intime'][i])
in_year = intime.strip().split("/")
age = int(in_year[2])-int(born_year)
data_fee = data['fee'][i]
if age in age_fee.keys():
templist = list(age_fee[age])
templist.append(data_fee)
age_fee[age] =list(templist)
else:
age_fee[age] = [data_fee]
# 计算平均费用
avg_age_fee ={}
for k in age_fee.keys():
avg = np.mean(list(age_fee[k]))
avg_age_fee[k] = avg
sort_avg_fee = dict(sorted(avg_age_fee.items(), key=lambda x: x[0]))
#绘制直方图，阶段年龄与平均费用的
li30 =[]
li40 =[]
li50 =[]
li60 =[]
li70 =[]
limax =[]
n_age_fee = {}
for k in age_fee.keys():
age = int(k)
if age <=30:
li30.extend(age_fee[k])
elif age <=40:
li40.extend(age_fee[k])
elif age <=50:
li50.extend(age_fee[k])
elif age <=60:
li60.extend(age_fee[k])
elif age <=70:
li70.extend(age_fee[k])
else:
limax.extend(age_fee[k])
n_age_fee[30] = len(li30)
n_age_fee[40] = len(li40)
n_age_fee[50] = len(li50)
n_age_fee[60] = len(li60)
n_age_fee[70] = len(li70)
n_age_fee[90] = len(limax)
# 计算平均费用
sort_level_age_fee = dict(sorted(n_age_fee.items(), key=lambda x: x[0]))
x = ['<=30','31-40','41-50','51-60','61-70','>=70']
y = list(sort_level_age_fee.values())
plt.title('Relationship between age_range and population')
plt.xlabel('age_range')
plt.ylabel('population')
for a,b in zip(x,y):
plt.text(a, b+0.05, '%.0f' % b, ha='center', va= 'bottom',fontsize=11)
plt.bar(x, y)
plt.show()
print()
``````

（4）性别和人口以及费用关系的柱状图

``````def sex_static(data):
sex_fee ={}
datalen = len(data)
male =[]
female =[]
for i in range(0,datalen):
sex = data['sex'][i]
if sex=='未知':
continue
elif sex=='男':
male.append(data['fee'][i])
else:
female.append(data['fee'][i])
sex_fee['male'] = male
sex_fee['female'] = female
# 计算平均费用
avg_sex_fee ={}
for k in sex_fee.keys():
avg = np.mean(list(sex_fee[k]))
avg_sex_fee[k] = avg
n_sex_fee={}
n_sex_fee['male'] = len(sex_fee['male'])
n_sex_fee['female'] = len(sex_fee['female'])

x = ['male','female']
y1 = list(avg_sex_fee.values())
y2 = list(n_sex_fee.values())
plt.figure()
plt.title('Relationship between gender and cost')
plt.xlabel('gender')
plt.ylabel('medical-fee')
for a,b in zip(x,y1):
plt.text(a, b+0.05, '%.0f' % b, ha='center', va= 'bottom',fontsize=11)
plt.bar(x, y1)
plt.figure()
plt.title('Relationship between gender and population')
plt.xlabel('gender')
plt.ylabel('population')
for a,b in zip(x,y2):
plt.text(a, b+0.05, '%.0f' % b, ha='center', va= 'bottom',fontsize=11)
plt.bar(x, y2)
plt.show()
print()
``````

（5）住院时长与平均费用的关系

``````def duration_static(data):
duration_fee ={}
datalen = len(data)
for i in range(0,datalen):
intime  = ''.join(data['intime'][i]).strip()
outtime = ''.join(data['outtime'][i]).strip()
data_fee = data['fee'][i]
date1=datetime.datetime.strptime(outtime[0:10],"%m/%d/%Y")
date2=datetime.datetime.strptime(intime[0:10],"%m/%d/%Y")
day =(date1-date2).days
if int(day)>300:
continue
if day in duration_fee.keys():
templist = list(duration_fee[day])
templist.append(data_fee)
duration_fee[day] =list(templist)
else:
duration_fee[day] = [data_fee]
# 计算平均费用
avg_duration_fee ={}
for k in duration_fee.keys():
avg = np.mean(list(duration_fee[k]))
avg_duration_fee[k] = avg
sort_avg_fee = dict(sorted(avg_duration_fee.items(), key=lambda x: x[0]))
#绘制直方图，阶段年龄与平均费用的
li01 =[]
li25 =[]
li69 =[]
li60 =[]
li100 =[]
n_duration_fee = {}
a_duration_fee ={}
for k in duration_fee.keys():
day = int(k)
if day ==0 or day ==1:
li01.extend(duration_fee[k])
elif day <=5:
li25.extend(duration_fee[k])
elif day <=9:
li69.extend(duration_fee[k])
elif day <=60:
li60.extend(duration_fee[k])
else:
li100.extend(duration_fee[k])
n_duration_fee[1] = len(li01)
n_duration_fee[25] = len(li25)
n_duration_fee[69] = len(li69)
n_duration_fee[60] = len(li60)
n_duration_fee[100] = len(li100)
# 计算平均费用
a_duration_fee[1] = np.mean(li01)
a_duration_fee[25] = np.mean(li25)
a_duration_fee[69] = np.mean(li69)
a_duration_fee[60] = np.mean(li60)
a_duration_fee[100] = np.mean(li100)
sort_level_duration_fee = dict(sorted(a_duration_fee.items(), key=lambda x: x[0]))
'''
x1 = list(sort_avg_fee.keys())
x2 = ['0-1','2-5','6-9','10-60','>=60']
y1 = list(sort_avg_fee.values())
y2 = list(sort_level_duration_fee.values())
plt.title('Relationship between hospital-time and medical fee')
plt.xlabel('day range')
plt.ylabel('medical fee')
# plt.xticks([])
plt.scatter(x1, y1, alpha=0.9)  # 绘制散点图，透明度为0.6（这样颜色浅一点，比较好看）
plt.figure()
plt.title('Relationship between hospital-time and population')
plt.xlabel('day range')
plt.ylabel('population')
for a,b in zip(x2,y2):
plt.text(a, b+0.05, '%.0f' % b, ha='center', va= 'bottom',fontsize=11)
plt.bar(x2, y2)
plt.show()
'''
x3 = ['0-1','2-5','6-9','10-60','>=60']
y3 = list(sort_level_duration_fee.values())
plt.title('Relationship between hospital-time and medical fee')
plt.xlabel('day range')
plt.ylabel('medical fee')
for a,b in zip(x3,y3):
plt.text(a, b+0.05, '%.0f' % b, ha='center', va= 'bottom',fontsize=11)
plt.bar(x3, y3)
plt.show()
print()
``````

（6）住院时长与费用的关系

（7）有无并发症与费用的关系，有无并发症与人口数量的关系

``````def complication_static(data):
com_fee ={}
datalen = len(data)
serious =[]
general =[]
non = []
for i in range(0,datalen):
drgsid = ''.join(data['drgsid'][i]).strip()
lastid = int(drgsid[-1])
if lastid==1:
serious.append(data['fee'][i])
elif lastid==3:
general.append(data['fee'][i])
elif lastid==5:
non.append(data['fee'][i])
else:
continue
com_fee['serious'] = serious
com_fee['general'] = general
com_fee['non'] = non
# 计算平均费用
avg_com_fee ={}
for k in com_fee.keys():
avg = np.mean(list(com_fee[k]))
avg_com_fee[k] = avg
n_com_fee={}
n_com_fee['serious'] = len(com_fee['serious'])
n_com_fee['general'] = len(com_fee['general'])
n_com_fee['non'] = len(com_fee['non'])
x = ['serious','general','non']
y1 = list(avg_com_fee.values())
y2 = list(n_com_fee.values())
plt.figure()
plt.title('Relationship between complication and medical fee')
plt.xlabel('complication')
plt.ylabel('medical fee')
for a,b in zip(x,y1):
plt.text(a, b+0.05, '%.0f' % b, ha='center', va= 'bottom',fontsize=11)
plt.bar(x, y1)
plt.figure()
plt.title('Relationship between complication and population')
plt.xlabel('complication')
plt.ylabel('population')
for a,b in zip(x,y2):
plt.text(a, b+0.05, '%.0f' % b, ha='center', va= 'bottom',fontsize=11)
plt.bar(x, y2)
plt.show()
print()
``````

（1）费用与年龄呈非线性关系，年龄越大，平均费用越高
（2）男性比女性人数多，男性比女性平均费用高
（3）并发症分为严重、一般、无三种。数据分析发现：严重并发症的平均费用最高，一般并发症的平均费用较低，而一般并发症的人数最多。
（4）住院时长与费用呈线性关系，住院费用随着住院时长而线性增长。