-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPLSQL_ETL_Project.sql
More file actions
359 lines (342 loc) · 13.7 KB
/
PLSQL_ETL_Project.sql
File metadata and controls
359 lines (342 loc) · 13.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
--Procedure for creating ratings table
CREATE OR REPLACE PROCEDURE crt_rat_tbl IS
drpsql VARCHAR2(1000):='DROP TABLE ratings CASCADE CONSTRAINTS PURGE';
crtsql VARCHAR2(1000):='CREATE TABLE ratings (user_id VARCHAR2(4)
CONSTRAINT rat_FK_1
REFERENCES users(user_id),
movie_id VARCHAR2(4)
CONSTRAINT rat_FK_2
REFERENCES movies(movie_id),
rating CHAR(1),
tmst VARCHAR2(10),
CONSTRAINT rat_PK PRIMARY KEY (user_id, movie_id))';
BEGIN
DBMS_OUTPUT.PUT_LINE('Dropping Ratings Table');
EXECUTE IMMEDIATE drpsql;
DBMS_OUTPUT.PUT_LINE('Creating Ratings Table');
EXECUTE IMMEDIATE crtsql;
DBMS_OUTPUT.PUT_LINE('Created Ratings Table');
EXCEPTION
WHEN OTHERS THEN
DBMS_OUTPUT.PUT_LINE('Nothing to drop');
DBMS_OUTPUT.PUT_LINE('Creating Ratings Table');
EXECUTE IMMEDIATE crtsql;
DBMS_OUTPUT.PUT_LINE('Created Ratings Table');
END crt_rat_tbl;
/
--Procedure for loading ratings table
CREATE OR REPLACE PROCEDURE insert_rat_tbl IS
CURSOR in_rat_tbl IS
SELECT *
FROM RATING_RAW;
inssql VARCHAR2(1000):='INSERT INTO ratings VALUES(:a,:b,:c,:d)';
row_count BINARY_INTEGER;
BEGIN
DBMS_OUTPUT.PUT_LINE('Loading ratings Table');
FOR rw IN in_rat_tbl LOOP
EXECUTE IMMEDIATE inssql
USING rw.COL_1,rw.COL_2,rw.COL_3,rw.COL_4;
row_count := in_rat_tbl%ROWCOUNT;
END LOOP;
DBMS_OUTPUT.PUT_LINE(row_count || ' records loaded to Ratings table');
END insert_rat_tbl;
/
--Procedure for creating users table
CREATE OR REPLACE PROCEDURE crt_usr_tbl IS
drpsql VARCHAR2(1000):='DROP TABLE users CASCADE CONSTRAINTS PURGE';
crtsql VARCHAR2(1000):='CREATE TABLE users (user_id VARCHAR2(4) CONSTRAINT usr_pk PRIMARY KEY NOT NULL,
gender CHAR(1),
age VARCHAR2(2)
CONSTRAINT usr_FK_1
REFERENCES age_decode(age_code),
occupation VARCHAR2(2)
CONSTRAINT usr_FK_2
REFERENCES occupation_decode(occupation_code),
ZIP VARCHAR2(10))';
BEGIN
DBMS_OUTPUT.PUT_LINE('Dropping Users Table');
EXECUTE IMMEDIATE drpsql;
DBMS_OUTPUT.PUT_LINE('Creating Users Table');
EXECUTE IMMEDIATE crtsql;
DBMS_OUTPUT.PUT_LINE('Created Users Table');
EXCEPTION
WHEN OTHERS THEN
DBMS_OUTPUT.PUT_LINE('Nothing to drop');
DBMS_OUTPUT.PUT_LINE('Creating Users Table');
EXECUTE IMMEDIATE crtsql;
DBMS_OUTPUT.PUT_LINE('Created Users Table');
END crt_usr_tbl;
/
--Procedure for loading users table
CREATE OR REPLACE PROCEDURE insert_usr_tbl IS
CURSOR in_usr_tbl IS
SELECT *
FROM USERS_RAW;
inssql VARCHAR2(1000):='INSERT INTO users VALUES(:a,:b,:c,:d,:e)';
row_count BINARY_INTEGER;
BEGIN
DBMS_OUTPUT.PUT_LINE('Loading Users Table');
FOR rw IN in_usr_tbl LOOP
EXECUTE IMMEDIATE inssql
USING rw.COLUMN1,rw.COLUMN2,rw.COLUMN3,rw.COLUMN4,rw.COLUMN5;
row_count := in_usr_tbl%ROWCOUNT;
END LOOP;
DBMS_OUTPUT.PUT_LINE(row_count || ' records loaded to Users table');
END insert_usr_tbl;
/
--Procedure for creating movies table
CREATE OR REPLACE PROCEDURE crt_movie_tbl IS
drpsql VARCHAR2(1000):='DROP TABLE movies CASCADE CONSTRAINTS PURGE';
crtsql VARCHAR2(1000):='CREATE TABLE movies (movie_id VARCHAR2(4) CONSTRAINT mov_pk PRIMARY KEY NOT NULL,
title VARCHAR2(100),
movie_year VARCHAR2(4))';
BEGIN
DBMS_OUTPUT.PUT_LINE('Dropping Movies Table');
EXECUTE IMMEDIATE drpsql;
DBMS_OUTPUT.PUT_LINE('Creating Movies Table');
EXECUTE IMMEDIATE crtsql;
DBMS_OUTPUT.PUT_LINE('Created Movies Table');
EXCEPTION
WHEN OTHERS THEN
DBMS_OUTPUT.PUT_LINE('Nothing to drop');
DBMS_OUTPUT.PUT_LINE('Creating Movies Table');
EXECUTE IMMEDIATE crtsql;
DBMS_OUTPUT.PUT_LINE('Created Movies Table');
END crt_movie_tbl;
/
--Procedure for loading movies table
CREATE OR REPLACE PROCEDURE insert_movie_tbl IS
CURSOR in_mov_tbl IS
SELECT COLUMN1, COLUMN2
FROM MOVIES_RAW;
title VARCHAR2(100);
yr VARCHAR2(4);
inssql VARCHAR2(1000):='INSERT INTO movies VALUES(:a, :b, :c)';
row_count BINARY_INTEGER;
BEGIN
DBMS_OUTPUT.PUT_LINE('Loading Movies Table');
FOR rw IN in_mov_tbl LOOP
title := trim(replace(trim(rw.column2),substr(trim(rw.column2),(instr(rw.column2,'(',-1)),6),''));
yr := substr(trim(rw.column2), (instr(trim(rw.column2),'(',-1))+1,4);
EXECUTE IMMEDIATE inssql
USING rw.column1, title, yr;
row_count := in_mov_tbl%ROWCOUNT;
END LOOP;
DBMS_OUTPUT.PUT_LINE(row_count || ' records loaded to Movies table');
END insert_movie_tbl;
/
--Procedure for creating movies-category table
CREATE OR REPLACE PROCEDURE crt_cat_tbl IS
drpsql VARCHAR2(1000):='DROP TABLE movie_cat CASCADE CONSTRAINTS PURGE';
crtsql VARCHAR2(1000):='CREATE TABLE movie_cat (movie_id VARCHAR2(4),
movie_cat VARCHAR2(47),
CONSTRAINT cat_pk PRIMARY KEY (movie_id, movie_cat))';
BEGIN
DBMS_OUTPUT.PUT_LINE('Dropping Movie-Category Table');
EXECUTE IMMEDIATE drpsql;
DBMS_OUTPUT.PUT_LINE('Creating Movie-Category Table');
EXECUTE IMMEDIATE crtsql;
DBMS_OUTPUT.PUT_LINE('Created Movie-Category Table');
EXCEPTION
WHEN OTHERS THEN --avoids error when running procedure without existing table
DBMS_OUTPUT.PUT_LINE('Nothing to drop');
DBMS_OUTPUT.PUT_LINE('Creating Movie-Category Table');
EXECUTE IMMEDIATE crtsql;
DBMS_OUTPUT.PUT_LINE('Created Movie-Category Table');
END crt_cat_tbl;
/
--Procedure for loading movie-category table
CREATE OR REPLACE PROCEDURE insert_cat_tbl IS
--Declaring Cursor which reads all records from raw movie input in order to create another able with
CURSOR split_genre IS
SELECT COLUMN1, COLUMN3
FROM MOVIES_RAW;
cat_count BINARY_INTEGER; --Used to count number of pipes in a given value
cat_val VARCHAR2(47); --Used to capture individual Genre/Category from multiple piped categories
inssql VARCHAR2(1000):='INSERT INTO movie_cat VALUES(:a, :b)';
row_count BINARY_INTEGER;
BEGIN
DBMS_OUTPUT.PUT_LINE('Loading Movie-Category Table');
FOR rw IN split_genre LOOP
cat_count := regexp_count(rw.column3,'\|',1); --Counts the number of times given pattern exists in value
IF cat_count = 0 THEN --When there's no pipes, value is atomic, no need to split
EXECUTE IMMEDIATE inssql
USING rw.column1, rw.column3;
ELSE
cat_val := REGEXP_SUBSTR(rw.column3, '^[^\|]+', 1); --Find first cat value from start of the col till first '|'
EXECUTE IMMEDIATE inssql
USING rw.column1, cat_val;
FOR i IN 1..cat_count LOOP --To capture remaining cat vals with given pattern
EXIT WHEN i = cat_count; --Last val needs special pattern as shown below
cat_val := REGEXP_SUBSTR(rw.column3, '[^\|]+', INSTR(rw.COLUMN3,'|',1,i),1);
EXECUTE IMMEDIATE inssql
USING rw.column1, cat_val;
END LOOP;
cat_val := REGEXP_SUBSTR(rw.column3, '[^\|]+$', INSTR(rw.COLUMN3,'|',1,cat_count),1); --Captures last cat in a col
EXECUTE IMMEDIATE inssql
USING rw.column1, cat_val;
END IF;
row_count := split_genre%ROWCOUNT;
END LOOP;
DBMS_OUTPUT.PUT_LINE(row_count || ' records processed');
END insert_cat_tbl;
/
--Procedure for creating age decode table
CREATE OR REPLACE PROCEDURE crt_age_tbl IS
drpsql VARCHAR2(1000):='DROP TABLE age_decode CASCADE CONSTRAINTS PURGE';
crtsql VARCHAR2(1000):='CREATE TABLE age_decode (age_code VARCHAR2(2) CONSTRAINT age_pk PRIMARY KEY NOT NULL,
age_group VARCHAR2(10))';
BEGIN
DBMS_OUTPUT.PUT_LINE('Dropping age_decode table');
EXECUTE IMMEDIATE drpsql;
DBMS_OUTPUT.PUT_LINE('Creating age_decode table');
EXECUTE IMMEDIATE crtsql;
DBMS_OUTPUT.PUT_LINE('Created age_decode table');
EXCEPTION
WHEN OTHERS THEN
DBMS_OUTPUT.PUT_LINE('Nothing to drop');
DBMS_OUTPUT.PUT_LINE('Creating age_decode table');
EXECUTE IMMEDIATE crtsql;
DBMS_OUTPUT.PUT_LINE('Created age_decode table');
END crt_age_tbl;
/
--Procedure for loading age decode table
CREATE OR REPLACE PROCEDURE insert_age_tbl IS
inssql VARCHAR2(1000):='INSERT INTO age_decode VALUES(:a,:b)';
BEGIN
DBMS_OUTPUT.PUT_LINE('Loading Age Table');
EXECUTE IMMEDIATE inssql
USING '1', 'Under 18';
EXECUTE IMMEDIATE inssql
USING '18', '18-24';
EXECUTE IMMEDIATE inssql
USING '25', '25-34';
EXECUTE IMMEDIATE inssql
USING '35', '35-44';
EXECUTE IMMEDIATE inssql
USING '45', '45-49';
EXECUTE IMMEDIATE inssql
USING '50', '50-55';
EXECUTE IMMEDIATE inssql
USING '56', '56+';
DBMS_OUTPUT.PUT_LINE('Age Table Loading Complete');
END insert_age_tbl;
/
--Procedure for creating occupation decode table
CREATE OR REPLACE PROCEDURE crt_occu_tbl IS
drpsql VARCHAR2(1000):='DROP TABLE occupation_decode CASCADE CONSTRAINTS PURGE';
crtsql VARCHAR2(1000):='CREATE TABLE occupation_decode (occupation_code VARCHAR2(2) CONSTRAINT occu_pk PRIMARY KEY NOT NULL,
occupation_group VARCHAR2(35)
)';
BEGIN
DBMS_OUTPUT.PUT_LINE('Dropping occupation_decode table');
EXECUTE IMMEDIATE drpsql;
DBMS_OUTPUT.PUT_LINE('Creating occupation_decode table');
EXECUTE IMMEDIATE crtsql;
DBMS_OUTPUT.PUT_LINE('Created occupation_decode table');
EXCEPTION
WHEN OTHERS THEN
DBMS_OUTPUT.PUT_LINE('Nothing to drop');
DBMS_OUTPUT.PUT_LINE('Creating occupation_decode table');
EXECUTE IMMEDIATE crtsql;
DBMS_OUTPUT.PUT_LINE('Created occupation_decode table');
END crt_occu_tbl;
/
--Procedure for loading occupation decode table
CREATE OR REPLACE PROCEDURE insert_occu_tbl IS
inssql VARCHAR2(1000):='INSERT INTO occupation_decode VALUES(:a,:b)';
BEGIN
DBMS_OUTPUT.PUT_LINE('Loading Occupation Table');
EXECUTE IMMEDIATE inssql
USING '0','other';
EXECUTE IMMEDIATE inssql
USING '1','academic/educator';
EXECUTE IMMEDIATE inssql
USING '2','artist';
EXECUTE IMMEDIATE inssql
USING '3','clerical/admin';
EXECUTE IMMEDIATE inssql
USING '4','college/grad student';
EXECUTE IMMEDIATE inssql
USING '5','customer service';
EXECUTE IMMEDIATE inssql
USING '6','doctor/health care';
EXECUTE IMMEDIATE inssql
USING '7','executive/managerial';
EXECUTE IMMEDIATE inssql
USING '8','farmer';
EXECUTE IMMEDIATE inssql
USING '9','homemaker';
EXECUTE IMMEDIATE inssql
USING '10','K-12 student';
EXECUTE IMMEDIATE inssql
USING '11','lawyer';
EXECUTE IMMEDIATE inssql
USING '12','programmer';
EXECUTE IMMEDIATE inssql
USING '13','retired';
EXECUTE IMMEDIATE inssql
USING '14','sales/marketing';
EXECUTE IMMEDIATE inssql
USING '15','scientist';
EXECUTE IMMEDIATE inssql
USING '16','self-employed';
EXECUTE IMMEDIATE inssql
USING '17','technician/engineer';
EXECUTE IMMEDIATE inssql
USING '18','tradesman/craftsman';
EXECUTE IMMEDIATE inssql
USING '19','unemployed';
EXECUTE IMMEDIATE inssql
USING '20','writer';
DBMS_OUTPUT.PUT_LINE('Occupation Table Loading Complete');
END insert_occu_tbl;
/
--Calling procedures to create the DB and load data
call crt_occu_tbl();
call insert_occu_tbl();
call crt_age_tbl();
call insert_age_tbl();
call crt_movie_tbl();
call insert_movie_tbl();
call crt_cat_tbl();
call insert_cat_tbl();
call crt_usr_tbl();
call insert_usr_tbl();
call crt_rat_tbl();
call insert_rat_tbl();
--Analysis queries
--Below query identifies gender and age group of users with ratings data
--Based on the output of the below query, only 25% of the ratings were by Females
SELECT a.gender, b.age_group, count(c.rating) rating_cnts
FROM users a
INNER JOIN age_decode b
on (a.age = b.age_code)
LEFT JOIN ratings c
on (a.user_id = c.user_id)
group by a.gender, b.age_group
;
--Below query identifies movie categories highly rated by male respondants
--Drama, Comedy, and Action ranked highly
SELECT b.movie_cat, count(c.user_id) user_cnt
FROM ratings a
INNER JOIN movie_cat b
ON (a.movie_id = b.movie_id)
INNER JOIN users c
ON (a.user_id = c.user_id)
WHERE c.gender = 'M'
AND a.rating > 3
GROUP BY b.movie_cat
ORDER BY 1;
--Below query identifies movie categories highly rated by male respondants
--Drama, Comedy, and Romance ranked highly
SELECT b.movie_cat, count(c.user_id) user_cnt
FROM ratings a
INNER JOIN movie_cat b
ON (a.movie_id = b.movie_id)
INNER JOIN users c
ON (a.user_id = c.user_id)
WHERE c.gender = 'F'
AND a.rating > 3
GROUP BY b.movie_cat
ORDER BY 1;