machine-learning-python-template/src/app.py at 8e741632050c4462f171e288307e7110cc1313f8 · 4GeeksAcademy/machine-learning-python-template · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# db_connect comes from the project-local `utils` module; its implementation
# is not visible from this file.
# NOTE(review): presumably this returns a database engine/connection — confirm.
# `engine` is not referenced anywhere in the visible part of this script.
from utils import db_connect
engine = db_connect()

# your code here

# Step 0: Import Libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Machine-learning tooling
from sklearn.model_selection import train_test_split

# Step 1: Load data
# Read the CSV straight from the remote URL into a DataFrame.
url = 'https://breathecode.herokuapp.com/asset/internal-link?id=927&path=AB_NYC_2019.csv'
df_raw = pd.read_csv(url)

# Preview a reproducible random sample. A bare expression is only echoed in a
# notebook; in a script its value is discarded, so print it explicitly.
print(df_raw.sample(10, random_state=2025))

# Step 2: Preprocessing
# Work on a copy so df_raw keeps the data exactly as loaded.
df_baking = df_raw.copy()
# Drop the columns that are not used in the rest of the analysis.
df_baking = df_baking.drop(
    columns=[
        'id', 'name', 'host_name', 'last_review',
        'reviews_per_month', 'latitude', 'longitude',
    ]
)
# Columns to be treated as categorical rather than numeric/text.
columnsCategory = ['host_id', 'neighbourhood_group', 'neighbourhood', 'room_type']
df_baking[columnsCategory] = df_baking[columnsCategory].astype('category')
# df is the cleaned frame used by the following steps.
df = df_baking.copy()
df.info()
# Print the sample explicitly: a bare expression is discarded when this file
# runs as a script.
print(df_raw.sample(5, random_state=2025))

# Step 3: EDA
# Hold out 10% of the rows as a test set; the fixed seed makes the split
# reproducible. Indices are reset so both frames are numbered from 0.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=2025)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Bare expressions are discarded in a script, so print the shapes explicitly.
print(df_train.shape, df_test.shape)

# `display` is an IPython/Jupyter helper and is undefined in a plain Python
# script (NameError); `print` works in both environments.
print(df_train.describe(include='number').T)
print(df_train.describe(include='category').T)

# Univariate analysis
# Histograms for the training frame's columns, laid out without overlap.
df_train.hist()
plt.tight_layout()
plt.show()

# One count plot per categorical column, each shown as its own figure.
# NOTE(review): host_id and neighbourhood may have a very large number of
# distinct values — confirm these two plots are readable/affordable.
for categorical_column in ('host_id', 'neighbourhood_group', 'neighbourhood', 'room_type'):
    sns.countplot(data=df_train, x=categorical_column)
    plt.show()

# Bivariate analysis: pairwise plots over the training frame
sns.pairplot(data=df_train, corner=True)
plt.show()

# Numeric vs. categorical: the same pair grid, coloured by neighbourhood_group
sns.pairplot(data=df_train, hue='neighbourhood_group', corner=True)
plt.show()

# Categorical vs. categorical: room_type counts split by neighbourhood_group.
# `data` is passed by keyword, like every other plotting call in this script,
# so the call does not depend on countplot's positional-parameter order.
sns.countplot(data=df_train, x="room_type", hue="neighbourhood_group")
plt.show()