forked from t1modeler/data_script
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuci_051_hcc_survival.py
114 lines (101 loc) · 3.91 KB
/
uci_051_hcc_survival.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import urllib.request
import io
import zipfile
import pandas # install pandas by "pip install pandas", or install Anaconda distribution (https://www.anaconda.com/)
# Warning: the data-processing techniques shown below are for concept explanation only; they are not best practices
# Data set repository:
# https://archive.ics.uci.edu/ml/datasets/HCC+Survival
# Location of the zipped data set. A value starting with 'http' is downloaded by
# this script; any other value is passed unchanged to zipfile.ZipFile, so to use
# a local copy set this to the path of the zip archive, e.g.
# r'D:\hcc-survival.zip' (a raw string keeps the backslash literal).
url_data_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00423/hcc-survival.zip'
def _format_size(num_bytes):
    """Render a byte count as 'N Bytes', 'N KB' or 'N.N MB' for progress output."""
    if num_bytes < 1024:
        return '%.0f Bytes' % num_bytes
    if num_bytes < 1024 * 1024:
        return '%.0f KB' % (num_bytes / 1024)
    return '%.1f MB' % (num_bytes / 1024 / 1024)


def download_file(url):
    """Fetch *url* completely into memory while printing a one-line progress display.

    Args:
        url: Address accepted by ``urllib.request.urlopen``.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 that holds the whole body.

    Raises:
        ValueError: The response carries an HTTP status other than 200.
    """
    # The context manager closes the response on every exit path, including
    # exceptions raised while reading.
    with urllib.request.urlopen(url) as resp:
        # Responses from non-HTTP handlers (e.g. file://) carry no status;
        # only an actual non-200 status is rejected.
        status = getattr(resp, 'status', None)
        if status is not None and status != 200:
            raise ValueError('Error: {0}'.format(resp.reason))
        print('\rStarted', end='\r')

        # The total is shown for information only; the header may be absent
        # (for example with chunked transfer encoding).
        content_length = resp.headers.get('Content-Length')
        if content_length is None:
            content_length_str = '(total: unknown)'
        else:
            content_length_str = '(total %s)' % _format_size(int(content_length))

        # Collect chunks and join once at the end instead of repeatedly
        # concatenating an ever-growing bytes object.
        chunks = []
        received = 0
        while True:
            chunk = resp.read(10 * 1024)
            if not chunk:
                # An empty read signals end of stream.
                break
            chunks.append(chunk)
            received += len(chunk)
            print(('\rDownloaded: %s ' % _format_size(received)) + content_length_str + ' ', end='\r')
    # Move past the progress line, which was written with carriage returns only.
    print()
    return io.BytesIO(b''.join(chunks))
# Fetch the archive from the UCI Machine Learning Repository when the location
# is a web address; otherwise use the configured value as-is (a local path).
if url_data_train.startswith('http'):
    data_train = download_file(url_data_train)
else:
    data_train = url_data_train
# Names assigned positionally to the columns of the header-less data file.
# The final entry, "Class", is the column later used as the target.
columns = [
    "Gender",
    "Symptoms",
    "Alcohol",
    "Hepatitis B Surface Antigen",
    "Hepatitis B e Antigen",
    "Hepatitis B Core Antibody",
    "Hepatitis C Virus Antibody",
    "Cirrhosis",
    "Endemic Countries",
    "Smoking",
    "Diabetes",
    "Obesity",
    "Hemochromatosis",
    "Arterial Hypertension",
    "Chronic Renal Insufficiency",
    "Human Immunodeficiency Virus",
    "Nonalcoholic Steatohepatitis",
    "Esophageal Varices",
    "Splenomegaly",
    "Portal Hypertension",
    "Portal Vein Thrombosis",
    "Liver Metastasis",
    "Radiological Hallmark",
    "Age at diagnosis",
    "Grams of Alcohol per day",
    "Packs of cigarets per year",
    "Performance Status",
    "Encefalopathy degree",
    "Ascites degree",
    "International Normalised Ratio",
    "Alpha-Fetoprotein (ng/mL)",
    "Haemoglobin (g/dL)",
    "Mean Corpuscular Volume (fl)",
    "Leukocytes(G/L)",
    "Platelets (G/L)",
    "Albumin (mg/dL)",
    "Total Bilirubin(mg/dL)",
    "Alanine transaminase (U/L)",
    "Aspartate transaminase (U/L)",
    "Gamma glutamyl transferase (U/L)",
    "Alkaline phosphatase (U/L)",
    "Total Proteins (g/dL)",
    "Creatinine (mg/dL)",
    "Number of Nodules",
    "Major dimension of nodule (cm)",
    "Direct Bilirubin (mg/dL)",
    "Iron (mcg/dL)",
    "Oxygen Saturation (%)",
    "Ferritin (ng/mL)",
    "Class",
]
# Open the archive and parse the data file stored inside it. The file is read
# with header=None, so the column names come from `columns`; '?' entries are
# treated as missing values.
with zipfile.ZipFile(data_train) as archive, \
        archive.open('hcc-survival/hcc-data.txt') as data_file:
    df_train = pandas.read_csv(
        data_file,
        header=None,
        names=columns,
        na_values='?',
    )
# Move the target to the front: pop() removes the original 'Class' column and
# returns it, and insert() places it at position 0 as 'target_Class'.
df_train.insert(0, 'target_Class', df_train.pop('Class'))
# Write the dataframe to a CSV file without the index column; the file can be
# zipped and uploaded to t1modeler.com to build a model.
df_train.to_csv('uci_051_hcc_survival.csv', index=False)