forked from beat-b/CProjectG6
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_LisbonTrip.py
More file actions
50 lines (35 loc) · 1.96 KB
/
clean_LisbonTrip.py
File metadata and controls
50 lines (35 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pandas as pd
from data_utils import *
# Clean the raw web-scraper export of Lisbon activities and write a tidy CSV.
# The helpers extract_times, select_types, duration_time and to_number are
# expected to come from the wildcard `data_utils` import above.

# Load the raw scraped data.
trip_df = pd.read_csv('./data/LisbonTrip_Final.csv')

# The text before the first '.' in 'Link' is used as the activity's ranking.
# Values that are not numeric become NaN (errors='coerce') instead of raising.
trip_df['Ranking'] = pd.to_numeric(
    trip_df['Link'].str.split('.').str[0],
    errors='coerce',
)

# Drop the scraper bookkeeping columns and 'Link' (no longer needed once the
# ranking has been extracted), then order the rows by ranking.
trip_df = trip_df.drop(
    ['web-scraper-order', 'web-scraper-start-url', 'Link'], axis=1
)
trip_df = trip_df.set_index('Ranking').sort_index()

# Expand each 'Schedule' value into four columns using extract_times.
trip_df[['StartTime', 'EndTime', 'LunchStart', 'LunchEnd']] = (
    trip_df['Schedule'].apply(extract_times).apply(pd.Series)
)

# Remove activities that are closed. The explicit .copy() makes the filtered
# frame independent of the original, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
trip_df = trip_df[trip_df['Schedule'] != 'Closed until further notice'].copy()

# Re-rank: discard the old 'Ranking' index and replace it with a consecutive
# 0-based index that follows the sorted order of the remaining rows.
trip_df = trip_df.reset_index(drop=True)

# 'Schedule' has been expanded into the time columns above; drop the original.
trip_df = trip_df.drop('Schedule', axis=1)

# Normalise the 'Types' column with select_types.
trip_df['Types'] = trip_df['Types'].apply(select_types)

# duration_time is expected to yield a (min, max) pair per row, which is
# unpacked into two new columns.
trip_df[['DurationMin', 'DurationMax']] = (
    trip_df['Duration'].apply(duration_time).tolist()
)

# Convert the review/rating count columns to numeric values with to_number.
count_columns = (
    'ReviewsNo',
    'ExcellentRating',
    'VeryGoodRating',
    'AverageRating',
    'PoorRating',
    'TerribleRating',
)
for column in count_columns:
    trip_df[column] = trip_df[column].apply(to_number)

# Export the cleaned data; the new 0-based index is written as the first
# (unnamed) column of the CSV.
trip_df.to_csv('./data/cleanTripLisbon.csv', index=True)