-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcancer_predictions.sql
More file actions
239 lines (208 loc) · 5.82 KB
/
cancer_predictions.sql
File metadata and controls
239 lines (208 loc) · 5.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
CREATE DATABASE cancer_predictions;
USE cancer_predictions;
SELECT * FROM cancer_data;
-- Check for duplicates
WITH duplicate_cte AS
(
SELECT *,
ROW_NUMBER() OVER(PARTITION BY age, gender, smoking, geneticRisk, physicalActivity, alcoholIntake, CancerHistory, diagnosis) AS row_num
FROM cancer_data
)
SELECT *
FROM duplicate_cte
WHERE row_num > 1;
-- Create derived columns
-- Age Group
ALTER TABLE cancer_data
ADD COLUMN age_group VARCHAR(10) AFTER age;
UPDATE cancer_data
SET age_group =
CASE
WHEN age < 20 THEN '0-19'
WHEN age BETWEEN 20 AND 29 THEN '20-29'
WHEN age BETWEEN 30 AND 39 THEN '30-39'
WHEN age BETWEEN 40 AND 49 THEN '40-49'
WHEN age BETWEEN 50 AND 59 THEN '50-59'
WHEN age BETWEEN 60 AND 69 THEN '60-69'
ELSE '70+'
END;
-- Gender
ALTER TABLE cancer_data
ADD COLUMN gender_label VARCHAR(6) AFTER gender;
UPDATE cancer_data
SET gender_label =
CASE
WHEN gender = 1 THEN 'Female'
ELSE 'Male'
END;
-- Smoking
ALTER TABLE cancer_data
ADD COLUMN smoking_status VARCHAR(3) AFTER smoking;
UPDATE cancer_data
SET smoking_status =
CASE
WHEN smoking = 1 THEN 'Yes'
ELSE 'No'
END;
-- Genetic Risk
ALTER TABLE cancer_data
ADD COLUMN genetic_risk VARCHAR(6) AFTER geneticRisk;
UPDATE cancer_data
SET genetic_risk =
CASE
WHEN geneticRisk = 0 THEN 'Low'
WHEN geneticRisk = 1 THEN 'Medium'
ELSE 'High'
END;
-- Cancer History
ALTER TABLE cancer_data
ADD COLUMN cancer_history VARCHAR(3) AFTER cancerHistory;
UPDATE cancer_data
SET cancer_history =
CASE
WHEN cancerHistory = 1 THEN 'Yes'
ELSE 'No'
END;
-- Diagnosis
ALTER TABLE cancer_data
ADD COLUMN diagnosis_status VARCHAR(10) AFTER Diagnosis;
UPDATE cancer_data
SET diagnosis_status =
CASE
WHEN Diagnosis = 1 THEN 'Cancer'
ELSE 'No Cancer'
END;
-- Genetic risk distribution within different age group
SELECT
age_group,
genetic_risk,
COUNT(*) as count
FROM cancer_data
GROUP BY age_group, genetic_risk
ORDER BY count DESC, genetic_risk DESC;
-- Average of patients with cancer diagnosis(0= No, 1=Yes)
SELECT diagnosis, AVG(Age) as avg_age
FROM cancer_data
GROUP BY Diagnosis;
-- Distribution of gender with cancer diagnosis
SELECT
gender_label,
diagnosis_status,
COUNT(*) as count
FROM cancer_data
GROUP BY gender_label, diagnosis_status
ORDER BY count DESC, diagnosis_status DESC;
-- Male to Female count of cancer diagnosis
SELECT
gender_label,
genetic_risk,
diagnosis_status,
COUNT(*) as count
FROM cancer_data
GROUP BY gender_label, genetic_risk, diagnosis_status
ORDER BY count DESC, diagnosis_status DESC;
-- smoking affects cancer diagnosis?
SELECT
smoking_status,
diagnosis_status, COUNT(*) as count
FROM cancer_data
GROUP BY smoking_status, diagnosis_status
ORDER BY count DESC, diagnosis_status DESC;
-- Count of paitents with/without cancer diagnosis by gentic risk
SELECT
genetic_risk,
diagnosis_status, COUNT(*) as count
FROM cancer_data
GROUP BY genetic_risk, diagnosis_status
ORDER BY count DESC, diagnosis_status DESC;
-- Avg BMI of diagnosis
SELECT
diagnosis_status,
ROUND(AVG(bmi),2) as avg_bmi
FROM cancer_data
GROUP BY diagnosis_status;
-- AVG of physical activity related to cancer diagnosis
SELECT
diagnosis_status,
ROUND(AVG(physicalActivity),2) as avg_physical_activity
FROM cancer_data
GROUP BY diagnosis_status;
-- alcoholIntake related to cancer diagnosis
SELECT
diagnosis_status,
ROUND(AVG(alcoholIntake),2) as avg_alcohol_intake
FROM cancer_data
GROUP BY diagnosis_status;
-- Avg BMI by level of physical activity grouped by genetic risk level
SELECT
genetic_risk,
CASE
WHEN physicalActivity < 3 THEN 'Low'
WHEN physicalActivity BETWEEN 3 AND 6 THEN 'Medium'
ELSE 'High'
END as physical_activity_level,
ROUND(AVG(bmi),2) as avg_bmi
FROM cancer_data
GROUP BY genetic_risk, physical_activity_level
ORDER BY avg_bmi ASC;
-- Cancer diagnosis rate fro different age groups
SELECT
age_group,
genetic_Risk,
SUM(diagnosis) as cancer_cases,
COUNT(*) as total_cases,
ROUND((SUM(diagnosis) / COUNT(*)) * 100, 2) as cancer_rate
FROM cancer_data
GROUP BY age_group, genetic_risk;
-- Cancer rate of patients grouped by gentisk risk and cancer history
SELECT
genetic_risk,
cancer_history,
SUM(diagnosis) as cancer_cases,
COUNT(*) as total_cases,
ROUND((SUM(diagnosis) / COUNT(*)) * 100, 2) as cancer_rate
FROM cancer_data
GROUP BY genetic_risk, cancer_history
ORDER BY cancer_rate DESC;
-- Correlation between smokers/nonsmokers and alcohol intake with cancer diagnosis
SELECT
smoking_status,
CASE
WHEN alcoholIntake < 1 THEN '0-0.9'
WHEN alcoholIntake BETWEEN 1 AND 2 THEN '1-2'
WHEN alcoholIntake BETWEEN 2 AND 3 THEN '2-3'
WHEN alcoholIntake BETWEEN 3 AND 4 THEN '3-4'
ELSE '4-5'
END as alcohol_intake_range,
diagnosis_status,
COUNT(*) as count
FROM cancer_data
GROUP BY smoking_status, alcohol_intake_range, diagnosis_status
ORDER BY count DESC, diagnosis_status DESC;
-- relation between physical activity, bmi, diagnosis between genders
SELECT
gender_label,
CASE
WHEN physicalActivity < 3 THEN 'Low'
WHEN physicalActivity BETWEEN 3 AND 6 THEN 'Medium'
ELSE 'High'
END as physical_activity_level,
ROUND(AVG(bmi),2) as avg_bmi,
diagnosis_status,
COUNT(*) as count
FROM cancer_data
GROUP BY gender_label, physical_activity_level, diagnosis_status
ORDER BY count DESC, diagnosis_status DESC;
-- Which age group(s) has higher than avg cancer rate
SELECT age_group, cancer_rate
FROM (
SELECT
age_group,
(SUM(diagnosis) / COUNT(*)) * 100 as cancer_rate
FROM cancer_data
GROUP BY age_group
) AS cancer_age_rate
WHERE cancer_rate > (
SELECT (SUM(diagnosis) / COUNT(*)) * 100
FROM cancer_data)
ORDER BY cancer_rate DESC;