# 1 分析

（1）数据清洗

• 性别、出生日期有缺失值，需要剔除

（2）特征工程

• 性别、是否手术需要0-1编码
• 根据出生日期和住院日期计算年龄
• 根据DRGs编码的最后一位判断并发症严重程度：1、5、7分别表示严重、一般、无并发症

# 2 实现

（1）数据清洗

def clear_data(data):
    """Drop rows with a missing birth year or unknown sex, then save to CSV.

    A row is treated as incomplete when its ``born`` value equals the
    placeholder string ``'0 AM'`` or its ``sex`` value equals ``'未知'``.
    Incomplete rows are removed from ``data`` in place, so the caller's
    DataFrame is modified. The remaining rows are written to
    ``clear_data.csv`` in the current working directory.

    Args:
        data: pandas DataFrame with ``born`` and ``sex`` columns.

    Returns:
        None.
    """
    # Build the mask column-wise instead of looking rows up with
    # data['born'][i]: that lookup is label-based and raises KeyError
    # whenever the index is not the default 0..n-1 range.
    incomplete = (data['born'] == '0 AM') | (data['sex'] == '未知')
    # Single in-place drop; the index is deliberately not reset, matching
    # the original behaviour of keeping the surviving row labels.
    data.drop(data.index[incomplete.values], inplace=True)
    print()
    data.to_csv('clear_data.csv')
    print()

（2）性别编码

def gender_code(data):
    """Encode the ``sex`` column as a list of 0/1 integers.

    ``'男'`` maps to 1 and every other value maps to 0, except ``'未知'``,
    which is skipped entirely. The returned list is therefore shorter than
    ``data`` when unknown entries are present.

    Args:
        data: mapping-like object (e.g. a pandas DataFrame) whose ``sex``
            entry is an iterable of values.

    Returns:
        List of ints in row order, without entries for ``'未知'`` rows.
    """
    # Iterate over the values themselves: data['sex'][i] is a label lookup
    # and raises KeyError on a frame whose index has gaps (for example
    # after rows were dropped without resetting the index).
    return [1 if sex == '男' else 0 for sex in data['sex'] if sex != '未知']

（3）是否手术编码

def surgery_code(data):
    """Encode the ``surgery`` column as a list of 0/1 integers.

    A null entry (as judged by ``pd.isnull``) maps to 0; any other value
    maps to 1.

    Args:
        data: mapping-like object (e.g. a pandas DataFrame) whose
            ``surgery`` entry is an iterable of values.

    Returns:
        List of ints, one per row, in row order.
    """
    # Iterate over values rather than data['surgery'][i], which is a label
    # lookup and fails when the index is not the default 0..n-1 range.
    return [0 if pd.isnull(entry) else 1 for entry in data['surgery']]

（4）计算年龄

# Age calculation
def age_code(data):
    """Compute each patient's age as admission year minus birth year.

    Rows whose ``born`` value is the placeholder string ``'0 AM'`` are
    skipped, so the returned list can be shorter than ``data``. The
    admission year is taken as the third ``/``-separated field of
    ``intime`` (presumably a month/day/year string; confirm against the
    data).

    Args:
        data: mapping-like object (e.g. a pandas DataFrame) with ``born``
            and ``intime`` entries of equal length.

    Returns:
        List of int ages in row order, without entries for skipped rows.

    Raises:
        IndexError: if an ``intime`` value has fewer than three
            ``/``-separated fields.
        ValueError: if the year field or ``born`` cannot be parsed as int.
    """
    age_list = []
    # Pair the columns positionally: data['born'][i] is a label lookup and
    # raises KeyError when the index has gaps.
    for born_year, intime in zip(data['born'], data['intime']):
        if born_year == '0 AM':
            continue
        date_parts = ''.join(intime).strip().split("/")
        age_list.append(int(date_parts[2]) - int(born_year))
    return age_list

（5）并发症复杂程度编码

def complicate_code(data):
    """Return the last character of every ``drgsid`` value.

    Per this file's notes, the codes 1, 5 and 7 denote severe, moderate and
    no complications respectively. The characters are returned as-is, with
    no conversion to int.

    Args:
        data: mapping-like object (e.g. a pandas DataFrame) whose ``drgsid``
            entry is an iterable of non-empty strings.

    Returns:
        List with the final character of each ``drgsid``, in row order.
    """
    # Iterate over values rather than data['drgsid'][i], which is a label
    # lookup and fails when the index is not the default 0..n-1 range.
    # The unused scratch DataFrame from the original was removed.
    return [drgs_id[-1] for drgs_id in data['drgsid']]

（6）给数据集打标签

def label_code(data, group_col=None):
    """Label every row 1, 2 or 3 according to where its ``fee`` falls.

    The fee range ``[min, max]`` is split into three consecutive intervals
    covering 40%, 40% and 20% of the span. A fee in the lowest interval
    (bounds inclusive) is labelled 1, a fee in the middle interval is
    labelled 2, and anything else is labelled 3.

    NOTE(review): the previous version of this function could not run. It
    appended to an undefined ``templist``, never filled ``category``, and
    compared against undefined ``lowd``/``midd``. The column it grouped
    fees by was not visible. By default the thresholds are therefore
    computed over all fees; pass ``group_col`` to compute them separately
    per group. Confirm the intended grouping with the author.

    Args:
        data: mapping-like object (e.g. a pandas DataFrame) whose ``fee``
            entry is an iterable of values convertible to float.
        group_col: optional name of a column whose values partition the
            rows; thresholds are then derived within each partition.

    Returns:
        List of int labels (1, 2 or 3), one per row, in row order.
    """
    fees = [float(fee) for fee in data['fee']]
    if not fees:
        return []

    # One shared key puts every row in a single group (global thresholds).
    keys = [None] * len(fees) if group_col is None else list(data[group_col])

    category = {}
    for key, fee in zip(keys, fees):
        category.setdefault(key, []).append(fee)

    # Per group: (minimum, upper bound of label 1, upper bound of label 2).
    mdict = {}
    for key, group_fees in category.items():
        minn = min(group_fees)
        span = max(group_fees) - minn
        low = span * (4 / 10)
        mid = span * (4 / 10)
        mdict[key] = (minn, minn + low, minn + low + mid)

    labelfee = []
    for key, dfee in zip(keys, fees):
        minn, lowd, midd = mdict[key]
        if minn <= dfee <= lowd:
            labelfee.append(1)
        elif lowd < dfee <= midd:
            labelfee.append(2)
        else:
            labelfee.append(3)
    return labelfee