diff --git a/requirements.txt b/requirements.txt index 3a2112c..046399e 100755 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ tqdm>=4.67.3 wget>=3.2 ty ruff +beautifulsoup4 diff --git a/swvo/io/RBMDataSet/RBMDataSet.py b/swvo/io/RBMDataSet/RBMDataSet.py index 670934a..5955e99 100644 --- a/swvo/io/RBMDataSet/RBMDataSet.py +++ b/swvo/io/RBMDataSet/RBMDataSet.py @@ -464,7 +464,7 @@ def get_loaded_variables(self) -> list[str]: loaded_vars.append(var.var_name) return loaded_vars - def __eq__(self, other: RBMDataSet) -> bool: # type :ignore[override] + def __eq__(self, other: RBMDataSet) -> bool: # ty :ignore[invalid-method-override] if ( self._file_loading_mode != other._file_loading_mode or self._satellite != other._satellite diff --git a/swvo/io/f10_7/swpc.py b/swvo/io/f10_7/swpc.py index 4428a60..227b8ae 100644 --- a/swvo/io/f10_7/swpc.py +++ b/swvo/io/f10_7/swpc.py @@ -255,7 +255,7 @@ def read(self, start_time: datetime, end_time: datetime, *, download: bool = Fal data_out = df_one_file.combine_first(data_out) if not data_out.empty: - data_out.index = enforce_utc_timezone(data_out.index) # ty: ignore[no-matching-overload] + data_out.index = enforce_utc_timezone(data_out.index) data_out.drop("date", axis=1, inplace=True) data_out = data_out.truncate( before=start_time - timedelta(hours=23.9999), diff --git a/swvo/io/hp/gfz.py b/swvo/io/hp/gfz.py index 9338e99..7a4173f 100755 --- a/swvo/io/hp/gfz.py +++ b/swvo/io/hp/gfz.py @@ -305,8 +305,7 @@ def _process_single_file(self, temp_dir: Path, start_time: datetime) -> pd.DataF {f"Hp{self.index_number}": json_data[f"Hp{self.index_number}"]}, index=pd.to_datetime(json_data["datetime"], utc=True), ) - data_total.index = enforce_utc_timezone(data_total.index) # ty: ignore[no-matching-overload] - + data_total.index = enforce_utc_timezone(data_total.index) return data_total def _read_single_file(self, file_path: str) -> pd.DataFrame: diff --git a/swvo/io/kp/__init__.py b/swvo/io/kp/__init__.py index 9549a15..8034a7c 100755 --- a/swvo/io/kp/__init__.py +++ b/swvo/io/kp/__init__.py @@ -6,6 +6,8 @@ from swvo.io.kp.niemegk import KpNiemegk as KpNiemegk from swvo.io.kp.omni import KpOMNI as KpOMNI from swvo.io.kp.swpc import KpSWPC as KpSWPC +from swvo.io.kp.sidc import KpSIDC as KpSIDC +from swvo.io.kp.bgs import KpBGS as KpBGS # This has to be imported after the models to avoid a circular import from swvo.io.kp.read_kp_from_multiple_models import read_kp_from_multiple_models as read_kp_from_multiple_models # noqa: I001 diff --git a/swvo/io/kp/bgs.py b/swvo/io/kp/bgs.py new file mode 100755 index 0000000..4cf639e --- /dev/null +++ b/swvo/io/kp/bgs.py @@ -0,0 +1,340 @@ +# SPDX-FileCopyrightText: 2026 GFZ Helmholtz Centre for Geosciences +# SPDX-FileContributor: Sahil Jhawar +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Module for handling BGS Kp data. +""" + +import logging +from datetime import datetime, timedelta, timezone +from pathlib import Path +from shutil import rmtree +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +import requests +from bs4 import BeautifulSoup + +from swvo.io.base import BaseIO +from swvo.io.utils import enforce_utc_timezone + +logger = logging.getLogger(__name__) + +logging.captureWarnings(True) + + +class KpBGS(BaseIO): + """A class to handle BGS Kp data. + + Parameters + ---------- + data_dir : Path | None + Data directory for the BGS Kp data. If not provided, it will be read from the environment variable + + Methods + ------- + download_and_process + read + + Raises + ------ + ValueError + Returns `ValueError` if necessary environment variable is not set. + """ + + ENV_VAR_NAME = "RT_KP_BGS_STREAM_DIR" + + URL = "https://geomag.bgs.ac.uk/cgi-bin/solar" + NAME = "kp.html" + + LABEL = "bgs" + + def download_and_process(self, request_time: Optional[datetime] = None, reprocess_files: bool = False) -> None: + """Download and process BGS Kp data file for a specific month. + + Parameters + ---------- + request_time : Optional[datetime] + Time for which to download and process data (month and year are extracted). + reprocess_files : bool, optional + Downloads and processes the files again, defaults to False, by default False + + Raises + ------ + FileNotFoundError + Raise `FileNotFoundError` if the file is not downloaded successfully. + """ + + if request_time is None: + request_time = datetime.now(timezone.utc) + + request_time = enforce_utc_timezone(request_time) + + temporary_dir = Path("./temp_kp_bgs_wget") + temporary_dir.mkdir(exist_ok=True, parents=True) + + logger.debug(f"Downloading file {self.URL} ...") + + file_path = self.data_dir / request_time.strftime("%Y") / f"BGS_KP_FORECAST_{request_time.strftime('%Y%m')}.csv" + + if file_path.exists() and not reprocess_files: + return + + tmp_path = file_path.with_suffix(file_path.suffix + ".tmp") + try: + logger.info(f"Downloading file for {request_time.strftime('%Y-%m')} from {self.URL}") + html_file = temporary_dir / f"{self.NAME}" + self._download(html_file, request_time) + if not html_file.exists() or html_file.stat().st_size == 0: + raise FileNotFoundError(f"Error while downloading file: {self.URL}!") + + logger.debug("Processing file ...") + processed_df = self._process_single_file(html_file) + + if len(processed_df.index) == 0: + return + + file_path.parent.mkdir(parents=True, exist_ok=True) + processed_df.to_csv(tmp_path, index=True, header=False) + tmp_path.replace(file_path) + + logger.debug(f"Saving processed file {file_path}") + except Exception as e: + logger.error(f"Failed to process {file_path}: {e}") + if tmp_path.exists(): + tmp_path.unlink() + return + + rmtree(temporary_dir, ignore_errors=True) + + def _download(self, temporary_dir: Path, request_time: datetime) -> None: + """Download BGS Kp data for a specific month. + + Parameters + ---------- + temporary_dir : Path + Path to save the temporary downloaded file. + request_time : datetime + Time for which to download data (month and year are extracted). + """ + + payload = { + "name": "not given", + "solar_geo": "1", + "month": str(request_time.month), + "year": str(request_time.year), + } + + response = requests.post(self.URL, data=payload) + response.raise_for_status() + + with open(temporary_dir, "w") as f: + f.write(response.text) + + def read( + self, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, download: bool = False + ) -> pd.DataFrame: + """Read BGS Kp data for the specified time range. + + Parameters + ---------- + start_time : Optional[datetime] + Start time of the data to read. + end_time : Optional[datetime] + End time of the data to read. + download : bool, optional + Download data on the go, defaults to False. + + Returns + ------- + :class:`pandas.DataFrame` + BGS Kp dataframe. + """ + + if start_time is None: + start_time = datetime.now(timezone.utc) + + if end_time is None: + end_time = datetime.now(timezone.utc) + timedelta(days=2) + + if start_time >= end_time: + msg = "start_time must be before end_time" + logger.error(msg) + raise ValueError(msg) + + start_time = enforce_utc_timezone(start_time) + end_time = enforce_utc_timezone(end_time) + + logger.info(f"Reading data from {start_time} to {end_time}") + + file_paths, time_intervals = self._get_processed_file_list(start_time, end_time) + + # Download data for every month if download is True + if download: + for time_interval in time_intervals: + self.download_and_process(time_interval[0]) + + # initialize data frame with NaNs + t = pd.date_range( + datetime(start_time.year, start_time.month, start_time.day), + datetime(end_time.year, end_time.month, end_time.day, 23, 59, 59), + freq=timedelta(hours=3), + ) + data_out = pd.DataFrame(index=t) + data_out.index = enforce_utc_timezone(data_out.index) + data_out["kp"] = np.array([np.nan] * len(t)) + data_out["file_name"] = np.array([None] * len(t)) + + for file_path, time_interval in zip(file_paths, time_intervals): + if not file_path.exists(): + logger.warning(f"File {file_path} not found") + logger.warning(f"Data not available from {time_interval[0]} to {time_interval[1]}") + continue + logger.info(f"Reading data from {file_path}") + df_one_file = self._read_single_file(file_path) + + # combine the new file with the old ones, replace all values present in df_one_file in data_out + data_out = df_one_file.combine_first(data_out) + + data_out = data_out.truncate( + before=start_time - timedelta(hours=2.9999), + after=end_time + timedelta(hours=2.9999), + ) + + return data_out + + def _get_processed_file_list(self, start_time: datetime, end_time: datetime) -> Tuple[List, List]: + """Get list of file paths and their corresponding time intervals for monthly files. + + Returns + ------- + Tuple[List, List] + List of file paths and tuples containing (start_time, end_time) for each month. + """ + file_paths = [] + time_intervals = [] + + current_time = datetime( + start_time.year, + start_time.month, + 1, + 0, + 0, + 0, + tzinfo=timezone.utc, + ) + + if end_time.month == 12: + end_of_period = datetime(end_time.year + 1, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + else: + end_of_period = datetime(end_time.year, end_time.month + 1, 1, 0, 0, 0, tzinfo=timezone.utc) + + while current_time < end_of_period: + file_path = ( + self.data_dir / current_time.strftime("%Y") / f"BGS_KP_FORECAST_{current_time.strftime('%Y%m')}.csv" + ) + file_paths.append(file_path) + + month_start = current_time + if current_time.month == 12: + month_end = datetime( + current_time.year + 1, + 1, + 1, + 0, + 0, + 0, + tzinfo=timezone.utc, + ) + else: + month_end = datetime( + current_time.year, + current_time.month + 1, + 1, + 0, + 0, + 0, + tzinfo=timezone.utc, + ) + month_end -= timedelta(seconds=1) + time_intervals.append((month_start, month_end)) + + if current_time.month == 12: + current_time = datetime(current_time.year + 1, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + else: + current_time = datetime(current_time.year, current_time.month + 1, 1, 0, 0, 0, tzinfo=timezone.utc) + + return file_paths, time_intervals + + def _read_single_file(self, file_path) -> pd.DataFrame: + """Read BGS Kp file to a DataFrame. + + Parameters + ---------- + file_path : Path + Path to the file. + + Returns + ------- + pd.DataFrame + Data from BGS Kp file. + """ + df = pd.read_csv(file_path, names=["t", "kp"]) + + df["t"] = pd.to_datetime(df["t"]) + df.index = df["t"] + df.drop(labels=["t"], axis=1, inplace=True) + df.index = enforce_utc_timezone(df.index) + + df["file_name"] = file_path + df.loc[df["kp"].isna(), "file_name"] = None + + return df + + def _process_single_file(self, temporary_dir: Path) -> pd.DataFrame: + """Process BGS Kp file to a DataFrame. + + Parameters + ---------- + file_path : Path + Path to the file. + + Returns + ------- + pd.DataFrame + BGS Kp data. + """ + with open(temporary_dir, "r") as f: + content = f.read() + soup = BeautifulSoup(content, "html.parser") + table = soup.find("table") + + if not table: + msg = f"No table found in response from {self.URL}" + logger.error(msg) + raise ValueError(msg) + + rows = table.find_all("tr") + records = [] + + for row in rows[1:]: # skip header row + cols = row.find_all("td") + if len(cols) >= 9: + date = pd.to_datetime(cols[0].text.strip(), format="%d-%m-%y") + + for i, hour in enumerate(range(0, 24, 3)): + kp = self._snap_to_kp_scale(int(cols[i + 1].text.strip()) / 10) + timestamp = date + pd.Timedelta(hours=hour) + records.append({"t": timestamp, "Kp": kp}) + + df = pd.DataFrame(records).set_index("t") + return df + + def _snap_to_kp_scale(self, value: float) -> float: + KP_SCALE = np.linspace(0, 9, 28) + + idx = np.argmin(np.abs(KP_SCALE - value)) + return round(float(KP_SCALE[idx]), 3) diff --git a/swvo/io/kp/ensemble.py b/swvo/io/kp/ensemble.py index a42370a..f173189 100755 --- a/swvo/io/kp/ensemble.py +++ b/swvo/io/kp/ensemble.py @@ -122,7 +122,7 @@ def read(self, start_time: datetime, end_time: datetime) -> list[pd.DataFrame]: freq=timedelta(hours=3), ) data_out = pd.DataFrame(index=t) - data_out.index = enforce_utc_timezone(data_out.index) # ty: ignore[no-matching-overload] + data_out.index = enforce_utc_timezone(data_out.index) data_out["kp"] = np.array([np.nan] * len(t)) data_out = data_out.truncate( before=start_time - timedelta(hours=2.9999), @@ -144,7 +144,7 @@ def read(self, start_time: datetime, end_time: datetime) -> list[pd.DataFrame]: df["file_name"] = file df.loc[df["kp"].isna(), "file_name"] = None - df.index = enforce_utc_timezone(df.index) # ty: ignore[no-matching-overload] + df.index = enforce_utc_timezone(df.index) df = df.truncate( before=start_time - timedelta(hours=2.9999), diff --git a/swvo/io/kp/niemegk.py b/swvo/io/kp/niemegk.py index 5dd13db..d2849d5 100755 --- a/swvo/io/kp/niemegk.py +++ b/swvo/io/kp/niemegk.py @@ -156,7 +156,7 @@ def read(self, start_time: datetime, end_time: datetime, download: bool = False) freq=timedelta(hours=3), ) data_out = pd.DataFrame(index=t) - data_out.index = enforce_utc_timezone(data_out.index) # ty: ignore[no-matching-overload] + data_out.index = enforce_utc_timezone(data_out.index) data_out["kp"] = np.array([np.nan] * len(t)) data_out["file_name"] = np.array([None] * len(t)) @@ -247,7 +247,7 @@ def _read_single_file(self, file_path) -> pd.DataFrame: df["t"] = pd.to_datetime(df["t"]) df.index = df["t"] df.drop(labels=["t"], axis=1, inplace=True) - df.index = enforce_utc_timezone(df.index) # ty: ignore[no-matching-overload] + df.index = enforce_utc_timezone(df.index) df["file_name"] = file_path df.loc[df["kp"].isna(), "file_name"] = None @@ -294,7 +294,7 @@ def _process_single_file(self, temporary_dir: Path) -> pd.DataFrame: ) data.index.rename("t", inplace=True) data.index = data["t"] - data.index = enforce_utc_timezone(data.index) # ty: ignore[no-matching-overload] + data.index = enforce_utc_timezone(data.index) data.drop(labels=["t"], axis=1, inplace=True) data.dropna(inplace=True) data = data[data["kp"] != -1.0] diff --git a/swvo/io/kp/read_kp_from_multiple_models.py b/swvo/io/kp/read_kp_from_multiple_models.py index 0ba8e34..8d08cee 100644 --- a/swvo/io/kp/read_kp_from_multiple_models.py +++ b/swvo/io/kp/read_kp_from_multiple_models.py @@ -16,7 +16,7 @@ import pandas as pd from swvo.io.exceptions import ModelError -from swvo.io.kp import KpEnsemble, KpNiemegk, KpOMNI, KpSWPC +from swvo.io.kp import KpBGS, KpEnsemble, KpNiemegk, KpOMNI, KpSIDC, KpSWPC from swvo.io.utils import ( any_nans, construct_updated_data_frame, @@ -25,7 +25,7 @@ logger = logging.getLogger(__name__) -KpModel = KpEnsemble | KpNiemegk | KpOMNI | KpSWPC +KpModel = KpEnsemble | KpNiemegk | KpOMNI | KpSWPC | KpBGS | KpSIDC logging.captureWarnings(True) @@ -39,7 +39,7 @@ def read_kp_from_multiple_models( # noqa: PLR0913 *, download: bool = False, recurrence: bool = False, - rec_model_order: Sequence[KpOMNI | KpNiemegk] | None = None, + rec_model_order: Sequence[KpOMNI | KpNiemegk | KpBGS] | None = None, ) -> pd.DataFrame | list[pd.DataFrame]: """Read Kp data from multiple models. @@ -67,7 +67,7 @@ def read_kp_from_multiple_models( # noqa: PLR0913 recurrence : bool, optional If True, fill missing values using 27-day recurrence from historical models (OMNI, Niemegk). Defaults to False. - rec_model_order : Sequence[KpOMNI | KpNiemegk], optional + rec_model_order : Sequence[KpOMNI | KpNiemegk | KpBGS], optional The order in which historical models will be used for 27-day recurrence filling. Defaults to [OMNI, Niemegk]. @@ -113,7 +113,7 @@ def read_kp_from_multiple_models( # noqa: PLR0913 if recurrence: if rec_model_order is None: - rec_model_order = [m for m in model_order if isinstance(m, (KpOMNI, KpNiemegk))] + rec_model_order = [m for m in model_order if isinstance(m, (KpOMNI, KpNiemegk, KpBGS))] for i, df in enumerate(data_out): if not df.empty: data_out[i] = _recursive_fill_27d_historical(df, download, rec_model_order) @@ -157,7 +157,7 @@ def _read_from_model( # noqa: PLR0913 """ # Read from historical models - if isinstance(model, (KpOMNI, KpNiemegk)): + if isinstance(model, (KpOMNI, KpNiemegk, KpBGS)): data_one_model = _read_historical_model( model, start_time, @@ -167,9 +167,9 @@ def _read_from_model( # noqa: PLR0913 ) # Forecasting models are called with synthetic now time - if isinstance(model, KpSWPC): + if isinstance(model, (KpSWPC, KpSIDC)): logger.info( - f"Reading swpc from {historical_data_cutoff_time.replace(hour=0, minute=0, second=0)} to {end_time}\noriginal historical_data_cutoff_time: {historical_data_cutoff_time}" + f"Reading {model.LABEL} from {historical_data_cutoff_time.replace(hour=0, minute=0, second=0)} to {end_time}\noriginal historical_data_cutoff_time: {historical_data_cutoff_time}" ) data_one_model = [ model.read( @@ -191,18 +191,18 @@ def _read_from_model( # noqa: PLR0913 def _read_historical_model( - model: KpOMNI | KpNiemegk, + model: KpOMNI | KpNiemegk | KpBGS, start_time: datetime, end_time: datetime, historical_data_cutoff_time: datetime, *, download: bool, ) -> pd.DataFrame: - """Reads Kp data from historical models (KpOMNI or KpNiemegk) within the specified time range. + """Reads Kp data from historical models (KpOMNI or KpNiemegk or KpBGS) within the specified time range. Parameters ---------- - model : KpOMNI | KpNiemegk + model : KpOMNI | KpNiemegk | KpBGS The historical model from which to read the data. start_time : datetime The start time of the data range. @@ -221,10 +221,10 @@ def _read_historical_model( Raises ------ TypeError - If the provided model is not an instance of KpOMNI or KpNiemegk. + If the provided model is not an instance of KpOMNI or KpNiemegk or KpBGS. """ - if not isinstance(model, (KpOMNI, KpNiemegk)): + if not isinstance(model, (KpOMNI, KpNiemegk, KpBGS)): msg = "Encountered invalide model type in read historical model!" raise TypeError(msg) @@ -355,7 +355,7 @@ def _recursive_fill_27d_historical(df, download, historical_models): DataFrame to fill with gaps. download : bool Download new data or not. - historical_models : list[KpOMNI | KpNiemegk] + historical_models : list[KpOMNI | KpNiemegk | KpBGS] List of historical models to use for filling gaps. value_col : str, optional _description_, by default "kp" diff --git a/swvo/io/kp/sidc.py b/swvo/io/kp/sidc.py new file mode 100755 index 0000000..2810f7b --- /dev/null +++ b/swvo/io/kp/sidc.py @@ -0,0 +1,327 @@ +# SPDX-FileCopyrightText: 2026 GFZ Helmholtz Centre for Geosciences +# SPDX-FileContributor: Sahil Jhawar +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Module for handling SIDC Kp data. +""" + +import logging +import warnings +from datetime import datetime, timedelta, timezone +from pathlib import Path +from shutil import rmtree +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +import requests + +from swvo.io.base import BaseIO +from swvo.io.utils import enforce_utc_timezone + +logger = logging.getLogger(__name__) + +logging.captureWarnings(True) + + +class KpSIDC(BaseIO): + """A class to handle SIDC Kp data. + + Parameters + ---------- + data_dir : Path | None + Data directory for the SIDC Kp data. If not provided, it will be read from the environment variable + + Methods + ------- + download_and_process + read + + Raises + ------ + ValueError + Returns `ValueError` if necessary environment variable is not set. + """ + + ENV_VAR_NAME = "FC_KP_SIDC_STREAM_DIR" + + URL = "https://ssa.sidc.be/prod/API/index.php?component=latest&pc=G158&psc=a" + NAME = "kp.json" + + LABEL = "sidc" + + def download_and_process( + self, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, reprocess_files: bool = False + ) -> None: + """Download and process SIDC Kp data file. + + Parameters + ---------- + start_time : Optional[datetime] + Start time of the data to download and process. + end_time : Optional[datetime] + End time of the data to download and process. + reprocess_files : bool, optional + Downloads and processes the files again, defaults to False, by default False + + Raises + ------ + FileNotFoundError + Raise `FileNotFoundError` if the file is not downloaded successfully. + """ + + if start_time is None: + start_time = datetime.now(timezone.utc) + + if end_time is None: + end_time = datetime.now(timezone.utc) + timedelta(days=2) + + start_time = enforce_utc_timezone(start_time) + end_time = enforce_utc_timezone(end_time) + + if start_time >= end_time: + msg = "start_time must be before end_time" + logger.error(msg) + raise ValueError(msg) + + temporary_dir = Path("./temp_kp_sidc_wget") + temporary_dir.mkdir(exist_ok=True, parents=True) + + logger.debug(f"Downloading file {self.URL} ...") + + file_paths, time_intervals = self._get_processed_file_list(start_time, end_time) + for num, (file_path, time_interval) in enumerate(zip(file_paths, time_intervals)): + if file_path.exists() and not reprocess_files: + continue + tmp_path = file_path.with_suffix(file_path.suffix + ".tmp") + try: + # check if download was successfull + json_file = temporary_dir / f"{self.NAME}" + self._download(json_file, time_interval[0], time_interval[1]) + if not json_file.exists() or json_file.stat().st_size == 0: + raise FileNotFoundError(f"Error while downloading file: {self.URL}!") + + logger.debug("Processing file ...") + processed_df = self._process_single_file(json_file) + data_single_file = processed_df[ + (processed_df.index >= time_interval[0]) & (processed_df.index <= time_interval[1]) + ] + + if len(data_single_file.index) == 0: + continue + + file_path.parent.mkdir(parents=True, exist_ok=True) + data_single_file.to_csv(tmp_path, index=True, header=False) + tmp_path.replace(file_path) + + logger.debug(f"Saving processed file {file_path}") + except Exception as e: + logger.error(f"Failed to process {file_path}: {e}") + if tmp_path.exists(): + tmp_path.unlink() + continue + + rmtree(temporary_dir, ignore_errors=True) + + def _download(self, temporary_dir, start_time: datetime, end_time: datetime) -> None: + response = requests.get( + self.URL + + f"&dts_start={start_time.strftime('%Y-%m-%dT%H:%M:%SZ')}&dts_end={(end_time + timedelta(seconds=1)).strftime('%Y-%m-%dT%H:%M:%SZ')}" + ) + response.raise_for_status() + + with open(temporary_dir, "w") as f: + f.write(response.text) + + def read( + self, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, download: bool = False + ) -> pd.DataFrame: + """Read SIDC Kp data for the specified time range. + + Parameters + ---------- + start_time : Optional[datetime] + Start time of the data to read. + end_time : Optional[datetime] + End time of the data to read. + download : bool, optional + Download data on the go, defaults to False. + + Returns + ------- + :class:`pandas.DataFrame` + SIDC Kp dataframe. + """ + + if start_time is None: + start_time = datetime.now(timezone.utc) + + if end_time is None: + end_time = datetime.now(timezone.utc) + timedelta(days=2) + + if start_time >= end_time: + msg = "start_time must be before end_time" + logger.error(msg) + raise ValueError(msg) + + start_time = enforce_utc_timezone(start_time) + end_time = enforce_utc_timezone(end_time) + + file_paths, time_intervals = self._get_processed_file_list(start_time, end_time) + + # initialize data frame with NaNs + t = pd.date_range( + datetime(start_time.year, start_time.month, start_time.day), + datetime(end_time.year, end_time.month, end_time.day, 23, 59, 59), + freq=timedelta(hours=3), + ) + data_out = pd.DataFrame(index=t) + data_out.index = enforce_utc_timezone(data_out.index) + data_out["kp"] = np.array([np.nan] * len(t)) + data_out["file_name"] = np.array([None] * len(t)) + + for file_path, time_interval in zip(file_paths, time_intervals): + if not file_path.exists(): + if download: + self.download_and_process(start_time, end_time) + + # if we request a date in the future, the file will still not be found here + if not file_path.exists(): + warnings.warn(f"File {file_path} not found") + continue + df_one_file = self._read_single_file(file_path) + + # combine the new file with the old ones, replace all values present in df_one_file in data_out + data_out = df_one_file.combine_first(data_out) + + data_out = data_out.truncate( + before=start_time - timedelta(hours=2.9999), + after=end_time + timedelta(hours=2.9999), + ) + + return data_out + + def _get_processed_file_list(self, start_time: datetime, end_time: datetime) -> Tuple[List, List]: + """Get list of file paths and their corresponding time intervals for monthly files. + + Returns + ------- + Tuple[List, List] + List of file paths and tuples containing (start_time, end_time) for each month. + """ + file_paths = [] + time_intervals = [] + + # Start from the first day of the month containing start_time + current_time = datetime( + start_time.year, + start_time.month, + 1, + 0, + 0, + 0, + tzinfo=timezone.utc, + ) + + # End at the last day of the month containing end_time + if end_time.month == 12: + end_of_period = datetime(end_time.year + 1, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + else: + end_of_period = datetime(end_time.year, end_time.month + 1, 1, 0, 0, 0, tzinfo=timezone.utc) + + while current_time < end_of_period: + file_path = ( + self.data_dir / current_time.strftime("%Y") / f"SIDC_KP_FORECAST_{current_time.strftime('%Y%m')}.csv" + ) + file_paths.append(file_path) + + # Create interval covering the entire month + month_start = current_time + if current_time.month == 12: + month_end = datetime( + current_time.year + 1, + 1, + 1, + 0, + 0, + 0, + tzinfo=timezone.utc, + ) + else: + month_end = datetime( + current_time.year, + current_time.month + 1, + 1, + 0, + 0, + 0, + tzinfo=timezone.utc, + ) + month_end -= timedelta(seconds=1) # Set to the last second of the month + time_intervals.append((month_start, month_end)) + + # Move to next month + if current_time.month == 12: + current_time = datetime(current_time.year + 1, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + else: + current_time = datetime(current_time.year, current_time.month + 1, 1, 0, 0, 0, tzinfo=timezone.utc) + + return file_paths, time_intervals + + def _read_single_file(self, file_path) -> pd.DataFrame: + """Read SIDC Kp file to a DataFrame. + + Parameters + ---------- + file_path : Path + Path to the file. + + Returns + ------- + pd.DataFrame + Data from SIDC Kp file. + """ + df = pd.read_csv(file_path, names=["t", "kp"]) + + df["t"] = pd.to_datetime(df["t"]) + df.index = df["t"] + df.drop(labels=["t"], axis=1, inplace=True) + df.index = enforce_utc_timezone(df.index) + + df["file_name"] = file_path + df.loc[df["kp"].isna(), "file_name"] = None + + return df + + def _process_single_file(self, temporary_dir: Path) -> pd.DataFrame: + """Process SIDC Kp file to a DataFrame. + + Parameters + ---------- + file_path : Path + Path to the file. + + Returns + ------- + pd.DataFrame + SIDC Kp data. + """ + + data = pd.read_json(temporary_dir) + data = pd.json_normalize(data["data"]) + + data = data.apply(lambda x: pd.to_datetime(x) if x.name != "value" else x) + + data.sort_values(["issue_time", "start_time"], inplace=True) + + data.drop(columns=["end_time"], inplace=True) + + data = data.loc[data.groupby("start_time")["issue_time"].idxmax(), ["start_time", "value"]] + data.rename(columns={"start_time": "t", "value": "Kp"}, inplace=True) + data.index = data["t"] + data.index = enforce_utc_timezone(data.index) + data.drop(columns=["t"], inplace=True) + + return data diff --git a/swvo/io/kp/swpc.py b/swvo/io/kp/swpc.py index 85a8cd2..66edb7f 100755 --- a/swvo/io/kp/swpc.py +++ b/swvo/io/kp/swpc.py @@ -174,7 +174,7 @@ def read(self, start_time: datetime, end_time: Optional[datetime] = None, downlo freq=timedelta(hours=3), ) data_out = pd.DataFrame(index=t) - data_out.index = enforce_utc_timezone(data_out.index) # ty: ignore[no-matching-overload] + data_out.index = enforce_utc_timezone(data_out.index) data_out["kp"] = np.array([np.nan] * len(t)) data_out["file_name"] = np.array([np.nan] * len(t)) diff --git a/swvo/io/utils.py b/swvo/io/utils.py index f0416be..b59c089 100644 --- a/swvo/io/utils.py +++ b/swvo/io/utils.py @@ -276,7 +276,11 @@ def enforce_utc_timezone(time: pd.Series) -> pd.Series: ... def enforce_utc_timezone(time: pd.DatetimeIndex) -> pd.DatetimeIndex: ... -def enforce_utc_timezone(time: datetime | list[datetime] | pd.Timestamp | pd.Series | pd.DatetimeIndex): +@overload +def enforce_utc_timezone(time: pd.Index) -> pd.Index: ... + + +def enforce_utc_timezone(time: datetime | list[datetime] | pd.Timestamp | pd.Series | pd.DatetimeIndex | pd.Index): """ Ensure datetime object(s) have UTC timezone information. diff --git a/tests/io/kp/data/kp.html b/tests/io/kp/data/kp.html new file mode 100644 index 0000000..1da0018 --- /dev/null +++ b/tests/io/kp/data/kp.html @@ -0,0 +1,261 @@ + + + + +
+ + + +| Date | Kp: Planetary 3Hr Range Index | Sum Kp | ap: Planetary Equiv. Amplitude | Ap | Cp | C9 | ||||||||||||||
| 01-01-25 | 40 | 53 | 50 | 60 | 67 | 80 | 63 | 40 | 453 | 27 | 56 | 48 | 80 | 111 | 207 | 94 | 27 | 81 | 1.8 | 7 |
| 02-01-25 | 33 | 20 | 33 | 47 | 43 | 37 | 33 | 30 | 277 | 18 | 7 | 18 | 39 | 32 | 22 | 18 | 15 | 21 | 1.1 | 5 |
| 03-01-25 | 30 | 33 | 33 | 20 | 7 | 13 | 10 | 17 | 163 | 15 | 18 | 18 | 7 | 3 | 5 | 4 | 6 | 10 | 0.5 | 2 |
| 04-01-25 | 20 | 47 | 50 | 43 | 33 | 43 | 53 | 30 | 320 | 7 | 39 | 48 | 32 | 18 | 32 | 56 | 15 | 31 | 1.3 | 6 |
| 05-01-25 | 33 | 23 | 23 | 33 | 30 | 43 | 40 | 27 | 253 | 18 | 9 | 9 | 18 | 15 | 32 | 27 | 12 | 18 | 1.0 | 5 |
| 06-01-25 | 30 | 37 | 27 | 30 | 27 | 30 | 23 | 23 | 227 | 15 | 22 | 12 | 15 | 12 | 15 | 9 | 9 | 14 | 0.8 | 4 |
| 07-01-25 | 23 | 27 | 23 | 27 | 23 | 33 | 20 | 20 | 197 | 9 | 12 | 9 | 12 | 9 | 18 | 7 | 7 | 10 | 0.6 | 3 |
| 08-01-25 | 37 | 23 | 17 | 0 | 10 | 23 | 10 | 7 | 127 | 22 | 9 | 6 | 0 | 4 | 9 | 4 | 3 | 7 | 0.4 | 2 |
| 09-01-25 | 20 | 10 | 17 | 20 | 37 | 27 | 17 | 13 | 160 | 7 | 4 | 6 | 7 | 22 | 12 | 6 | 5 | 9 | 0.5 | 2 |
| 10-01-25 | 10 | 33 | 20 | 27 | 30 | 13 | 23 | 40 | 197 | 4 | 18 | 7 | 12 | 15 | 5 | 9 | 27 | 12 | 0.7 | 3 |
| 11-01-25 | 23 | 10 | 13 | 10 | 17 | 30 | 13 | 7 | 123 | 9 | 4 | 5 | 4 | 6 | 15 | 5 | 3 | 6 | 0.3 | 1 |
| 12-01-25 | 13 | 17 | 17 | 17 | 23 | 13 | 13 | 17 | 130 | 5 | 6 | 6 | 6 | 9 | 5 | 5 | 6 | 6 | 0.3 | 1 |
| 13-01-25 | 40 | 23 | 10 | 23 | 20 | 23 | 23 | 20 | 183 | 27 | 9 | 4 | 9 | 7 | 9 | 9 | 7 | 10 | 0.6 | 3 |
| 14-01-25 | 27 | 23 | 23 | 20 | 20 | 27 | 30 | 27 | 197 | 12 | 9 | 9 | 7 | 7 | 12 | 15 | 12 | 10 | 0.6 | 3 |
| 15-01-25 | 33 | 37 | 37 | 27 | 30 | 33 | 27 | 17 | 240 | 18 | 22 | 22 | 12 | 15 | 18 | 12 | 6 | 16 | 0.9 | 4 |
| 16-01-25 | 27 | 33 | 17 | 23 | 30 | 30 | 37 | 27 | 223 | 12 | 18 | 6 | 9 | 15 | 15 | 22 | 12 | 14 | 0.8 | 4 |
| 17-01-25 | 33 | 40 | 37 | 30 | 33 | 30 | 40 | 40 | 283 | 18 | 27 | 22 | 15 | 18 | 15 | 27 | 27 | 21 | 1.1 | 5 |
| 18-01-25 | 20 | 20 | 23 | 20 | 23 | 20 | 20 | 20 | 167 | 7 | 7 | 9 | 7 | 9 | 7 | 7 | 7 | 8 | 0.4 | 2 |
| 19-01-25 | 10 | 23 | 37 | 27 | 33 | 30 | 30 | 37 | 227 | 4 | 9 | 22 | 12 | 18 | 15 | 15 | 22 | 15 | 0.8 | 4 |
| 20-01-25 | 37 | 33 | 30 | 30 | 43 | 30 | 27 | 37 | 267 | 22 | 18 | 15 | 15 | 32 | 15 | 12 | 22 | 19 | 1.0 | 5 |
| 21-01-25 | 23 | 30 | 20 | 17 | 10 | 27 | 30 | 23 | 180 | 9 | 15 | 7 | 6 | 4 | 12 | 15 | 9 | 10 | 0.5 | 2 |
| 22-01-25 | 27 | 20 | 30 | 23 | 17 | 13 | 10 | 30 | 170 | 12 | 7 | 15 | 9 | 6 | 5 | 4 | 15 | 9 | 0.5 | 2 |
| 23-01-25 | 33 | 17 | 7 | 10 | 13 | 37 | 37 | 23 | 177 | 18 | 6 | 3 | 4 | 5 | 22 | 22 | 9 | 11 | 0.6 | 3 |
| 24-01-25 | 13 | 13 | 23 | 13 | 20 | 10 | 10 | 23 | 127 | 5 | 5 | 9 | 5 | 7 | 4 | 4 | 9 | 6 | 0.3 | 1 |
| 25-01-25 | 3 | 0 | 3 | 3 | 7 | 10 | 0 | 7 | 33 | 2 | 0 | 2 | 2 | 3 | 4 | 0 | 3 | 2 | 0.0 | 0 |
| 26-01-25 | 3 | 7 | 3 | 3 | 3 | 0 | 0 | 3 | 23 | 2 | 3 | 2 | 2 | 2 | 0 | 0 | 2 | 2 | 0.0 | 0 |
| 27-01-25 | 0 | 0 | 13 | 17 | 13 | 27 | 20 | 20 | 110 | 0 | 0 | 5 | 6 | 5 | 12 | 7 | 7 | 5 | 0.2 | 1 |
| 28-01-25 | 20 | 13 | 7 | 23 | 27 | 27 | 37 | 27 | 180 | 7 | 5 | 3 | 9 | 12 | 12 | 22 | 12 | 10 | 0.6 | 3 |
| 29-01-25 | 20 | 10 | 10 | 20 | 20 | 17 | 13 | 13 | 123 | 7 | 4 | 4 | 7 | 7 | 6 | 5 | 5 | 6 | 0.3 | 1 |
| 30-01-25 | 7 | 7 | 7 | 3 | 7 | 13 | 20 | 27 | 90 | 3 | 3 | 3 | 2 | 3 | 5 | 7 | 12 | 5 | 0.2 | 1 |
| 31-01-25 | 17 | 7 | 10 | 17 | 20 | 17 | 20 | 27 | 133 | 6 | 3 | 4 | 6 | 7 | 6 | 7 | 12 | 6 | 0.3 | 1 |