-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathedautil.py
More file actions
57 lines (48 loc) · 1.96 KB
/
edautil.py
File metadata and controls
57 lines (48 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import pandas as pd
import datetime as dt
from pathlib import Path
def eda_open(file='data.csv'):
    """
    Read a CSV file into a pandas DataFrame and record a dated backup of it.

    The backup is written to 'backups/backup_<DD_MM_YYYY>.csv', relative to
    the current working directory (the directory is created if needed).
    Each call writes a banner holding the current timestamp followed by the
    full CSV dump of the data. If today's backup file already exists the
    new snapshot is appended to it; otherwise the file is created.

    Parameters:
    file (str): The path to the CSV file to be read. Default is 'data.csv'.

    Returns:
    pd.DataFrame: The DataFrame containing the data from the CSV file.
    """
    df = pd.read_csv(file)

    # Take a single timestamp so the date in the file name and the time in
    # the banner can never disagree (e.g. when called right at midnight).
    now = dt.datetime.now()
    backup_dir = Path("backups")
    backup_dir.mkdir(parents=True, exist_ok=True)
    backup_path = backup_dir / f"backup_{now.strftime('%d_%m_%Y')}.csv"

    appending = backup_path.exists()
    separator = '=' * 100
    banner = f"{separator}\n{now}\n{separator}\n"
    if appending:
        # Keep a blank line between the previous snapshot and this one.
        banner = "\n" + banner

    # newline="" is what pandas requires for a file object passed to
    # to_csv(); without it every row ends in "\r\r\n" on Windows.
    # UTF-8 matches read_csv's default decoding, so any data that could be
    # read can also be written regardless of the locale encoding.
    # Mode "a" creates the file when it does not exist yet.
    with backup_path.open("a", encoding="utf-8", newline="") as f:
        f.write(banner)
        df.to_csv(f, index=False)

    if appending:
        print("Appended new data to the existing backup.")
    else:
        print("Created a new backup.")
    return df
def eda_nan_check(df):
    """
    Summarise the missing values of a DataFrame, column by column.

    Parameters:
    df (pd.DataFrame): The DataFrame to inspect.

    Returns:
    pd.DataFrame: One row per column of `df` that has at least one missing
    value, indexed by column name, with 'Missing Values' (the count of
    missing entries) and 'Percentage' (that count as a percentage of the
    number of rows). Columns without missing values are left out.
    """
    nan_counts = df.isna().sum()
    row_total = len(df)

    summary = nan_counts.to_frame('Missing Values')
    summary['Percentage'] = (nan_counts / row_total) * 100

    # Only report the columns that actually have gaps.
    has_gaps = summary['Missing Values'].gt(0)
    return summary.loc[has_gaps]