From 79d878dea30286f7c481fa4305bccc23953b52ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=A3=20Bida=20Vacaro?= Date: Thu, 26 Feb 2026 20:02:55 -0300 Subject: [PATCH 1/6] feat: include a python API to dados.gov.br API --- condarecipe/pysus/meta.yaml | 1 - poetry.lock | 408 ++++++++++++++++++++++++++++++---- pyproject.toml | 5 +- pysus/api/dadosgov/client.py | 53 +++++ pysus/api/dadosgov/models.py | 83 +++++++ pysus/api/dadosgov/schemas.py | 0 pysus/ftp/README.md | 0 pysus/online_data/ESUS.py | 98 -------- pysus/preprocessing/ESUS.py | 69 ------ pysus/tests/test_esus.py | 16 -- setup.cfg | 2 +- 11 files changed, 503 insertions(+), 232 deletions(-) create mode 100644 pysus/api/dadosgov/client.py create mode 100644 pysus/api/dadosgov/models.py create mode 100644 pysus/api/dadosgov/schemas.py create mode 100644 pysus/ftp/README.md delete mode 100644 pysus/online_data/ESUS.py delete mode 100644 pysus/preprocessing/ESUS.py delete mode 100644 pysus/tests/test_esus.py diff --git a/condarecipe/pysus/meta.yaml b/condarecipe/pysus/meta.yaml index 1eeaaef0..41f45164 100644 --- a/condarecipe/pysus/meta.yaml +++ b/condarecipe/pysus/meta.yaml @@ -32,7 +32,6 @@ requirements: - pyarrow - python - requests - - elasticsearch test: imports: diff --git a/poetry.lock b/poetry.lock index 331da2c8..daacd197 100644 --- a/poetry.lock +++ b/poetry.lock @@ -28,13 +28,25 @@ files = [ {file = "alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65"}, ] +[[package]] +name = "annotated-types" +version = "0.7.0" +description = "Reusable constraint types to use with typing.Annotated" +optional = false +python-versions = ">=3.8" +groups = ["main", "dev"] +files = [ + {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, + {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, +] + [[package]] name = 
"anyio" version = "4.6.2.post1" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false python-versions = ">=3.9" -groups = ["docs"] +groups = ["dev", "docs"] files = [ {file = "anyio-4.6.2.post1-py3-none-any.whl", hash = "sha256:6d170c36fba3bdd840c73d3868c1e777e33676a69c3a72cf0a0d5d6d8009b61d"}, {file = "anyio-4.6.2.post1.tar.gz", hash = "sha256:4c8bc31ccdb51c7f7bd251f51c609e038d63e34219b44aa86e47576389880b4c"}, @@ -64,6 +76,21 @@ files = [ {file = "appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee"}, ] +[[package]] +name = "argcomplete" +version = "3.6.3" +description = "Bash tab completion for argparse" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "argcomplete-3.6.3-py3-none-any.whl", hash = "sha256:f5007b3a600ccac5d25bbce33089211dfd49eab4a7718da3f10e3082525a92ce"}, + {file = "argcomplete-3.6.3.tar.gz", hash = "sha256:62e8ed4fd6a45864acc8235409461b72c9a28ee785a2011cc5eb78318786c89c"}, +] + +[package.extras] +test = ["coverage", "mypy", "pexpect", "ruff", "wheel"] + [[package]] name = "argon2-cffi" version = "23.1.0" @@ -325,7 +352,7 @@ version = "2024.8.30" description = "Python package for providing Mozilla's CA Bundle." 
optional = false python-versions = ">=3.6" -groups = ["main", "dev", "docs", "geo"] +groups = ["dev", "docs", "geo"] files = [ {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, @@ -778,6 +805,40 @@ files = [ docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] tests = ["pytest", "pytest-cov", "pytest-xdist"] +[[package]] +name = "datamodel-code-generator" +version = "0.54.0" +description = "Datamodel Code Generator" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "datamodel_code_generator-0.54.0-py3-none-any.whl", hash = "sha256:3156df7a7e8fa5a7c9a6d50836e5ba5abe0532f6b71eee6d73a0c8e1fb5b7e47"}, + {file = "datamodel_code_generator-0.54.0.tar.gz", hash = "sha256:2b183598d049e265146a8224c35d1bb96a80a641ea8ecd2a82e6a0e97b56da6b"}, +] + +[package.dependencies] +argcomplete = ">=2.10.1,<4" +black = ">=19.10b0" +genson = ">=1.2.1,<2" +httpx = {version = ">=0.24.1", optional = true, markers = "extra == \"http\""} +inflect = ">=4.1,<8" +isort = ">=4.3.21,<8" +jinja2 = ">=2.10.1,<4" +packaging = "*" +pydantic = ">=1.5" +pyyaml = ">=6.0.1" +tomli = {version = ">=2.2.1,<3", markers = "python_version <= \"3.11\""} + +[package.extras] +all = ["graphql-core (>=3.2.3)", "httpx (>=0.24.1)", "openapi-spec-validator (>=0.2.8,<0.8)", "prance (>=0.18.2)", "pysnooper (>=0.4.1,<2)", "ruff (>=0.9.10)", "watchfiles (>=1.1)"] +debug = ["pysnooper (>=0.4.1,<2)"] +graphql = ["graphql-core (>=3.2.3)"] +http = ["httpx (>=0.24.1)"] +ruff = ["ruff (>=0.9.10)"] +validation = ["openapi-spec-validator (>=0.2.8,<0.8)", "prance (>=0.18.2)"] +watch = ["watchfiles (>=1.1)"] + [[package]] name = "dateparser" version = "1.2.0" @@ -913,28 +974,6 @@ files = [ {file = "docutils-0.18.1.tar.gz", hash = "sha256:679987caf361a7539d76e584cbeddc311e3aee937877c87346f31debc63e9d06"}, 
] -[[package]] -name = "elasticsearch" -version = "7.16.2" -description = "Python client for Elasticsearch" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" -groups = ["main"] -files = [ - {file = "elasticsearch-7.16.2-py2.py3-none-any.whl", hash = "sha256:c05aa792a52b1e6ad9d226340dc19165c4a491ac48fbd91af51ec839bf953210"}, - {file = "elasticsearch-7.16.2.tar.gz", hash = "sha256:23ac0afb4398c48990e359ac73ab6963741bd05321345299c62d9d23e209eee2"}, -] - -[package.dependencies] -certifi = "*" -urllib3 = ">=1.21.1,<2" - -[package.extras] -async = ["aiohttp (>=3,<4)"] -develop = ["black", "coverage", "jinja2", "mock", "pytest", "pytest-cov", "pyyaml", "requests (>=2.0.0,<3.0.0)", "sphinx (<1.7)", "sphinx-rtd-theme"] -docs = ["sphinx (<1.7)", "sphinx-rtd-theme"] -requests = ["requests (>=2.4.0,<3.0.0)"] - [[package]] name = "exceptiongroup" version = "1.2.2" @@ -1235,6 +1274,18 @@ files = [ {file = "future-1.0.0.tar.gz", hash = "sha256:bd2968309307861edae1458a4f8a4f3598c03be43b97521076aebf5d94c07b05"}, ] +[[package]] +name = "genson" +version = "1.3.0" +description = "GenSON is a powerful, user-friendly JSON Schema generator." 
+optional = false +python-versions = "*" +groups = ["dev"] +files = [ + {file = "genson-1.3.0-py3-none-any.whl", hash = "sha256:468feccd00274cc7e4c09e84b08704270ba8d95232aa280f65b986139cec67f7"}, + {file = "genson-1.3.0.tar.gz", hash = "sha256:e02db9ac2e3fd29e65b5286f7135762e2cd8a986537c075b06fc5f1517308e37"}, +] + [[package]] name = "geocoder" version = "1.38.1" @@ -1260,7 +1311,7 @@ version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false python-versions = ">=3.7" -groups = ["docs"] +groups = ["dev", "docs"] files = [ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, @@ -1272,7 +1323,7 @@ version = "1.0.7" description = "A minimal low-level HTTP client." optional = false python-versions = ">=3.8" -groups = ["docs"] +groups = ["dev", "docs"] files = [ {file = "httpcore-1.0.7-py3-none-any.whl", hash = "sha256:a3fff8f43dc260d5bd363d9f9cf1830fa3a458b332856f34282de498ed420edd"}, {file = "httpcore-1.0.7.tar.gz", hash = "sha256:8551cb62a169ec7162ac7be8d4817d561f60e08eaa485234898414bb5a8a0b4c"}, @@ -1294,7 +1345,7 @@ version = "0.27.2" description = "The next generation HTTP client." 
optional = false python-versions = ">=3.8" -groups = ["docs"] +groups = ["dev", "docs"] files = [ {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"}, {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"}, @@ -1371,6 +1422,30 @@ files = [ {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, ] +[[package]] +name = "inflect" +version = "7.5.0" +description = "Correctly generate plurals, singular nouns, ordinals, indefinite articles" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "inflect-7.5.0-py3-none-any.whl", hash = "sha256:2aea70e5e70c35d8350b8097396ec155ffd68def678c7ff97f51aa69c1d92344"}, + {file = "inflect-7.5.0.tar.gz", hash = "sha256:faf19801c3742ed5a05a8ce388e0d8fe1a07f8d095c82201eb904f5d27ad571f"}, +] + +[package.dependencies] +more_itertools = ">=8.5.0" +typeguard = ">=4.0.1" + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["pygments", "pytest (>=6,!=8.1.*)"] +type = ["pytest-mypy"] + [[package]] name = "iniconfig" version = "2.0.0" @@ -2128,6 +2203,18 @@ files = [ {file = "mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"}, ] +[[package]] +name = "more-itertools" +version = "10.8.0" +description = "More routines for operating on iterables, beyond itertools" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "more_itertools-10.8.0-py3-none-any.whl", hash = "sha256:52d4362373dcf7c52546bc4af9a86ee7c4579df9a8dc268be0a2f949d376cc9b"}, + {file = "more_itertools-10.8.0.tar.gz", hash = 
"sha256:f638ddf8a1a0d134181275fb5d58b086ead7c6a72429ad725c67503f13ba30bd"}, +] + [[package]] name = "mypy-extensions" version = "1.0.0" @@ -2911,6 +2998,162 @@ files = [ {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] +[[package]] +name = "pydantic" +version = "2.12.5" +description = "Data validation using Python type hints" +optional = false +python-versions = ">=3.9" +groups = ["main", "dev"] +files = [ + {file = "pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d"}, + {file = "pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49"}, +] + +[package.dependencies] +annotated-types = ">=0.6.0" +pydantic-core = "2.41.5" +typing-extensions = ">=4.14.1" +typing-inspection = ">=0.4.2" + +[package.extras] +email = ["email-validator (>=2.0.0)"] +timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] + +[[package]] +name = "pydantic-core" +version = "2.41.5" +description = "Core functionality for Pydantic validation and serialization" +optional = false +python-versions = ">=3.9" +groups = ["main", "dev"] +files = [ + {file = "pydantic_core-2.41.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:77b63866ca88d804225eaa4af3e664c5faf3568cea95360d21f4725ab6e07146"}, + {file = "pydantic_core-2.41.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dfa8a0c812ac681395907e71e1274819dec685fec28273a28905df579ef137e2"}, + {file = "pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5921a4d3ca3aee735d9fd163808f5e8dd6c6972101e4adbda9a4667908849b97"}, + {file = "pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25c479382d26a2a41b7ebea1043564a937db462816ea07afa8a44c0866d52f9"}, + {file = "pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:f547144f2966e1e16ae626d8ce72b4cfa0caedc7fa28052001c94fb2fcaa1c52"}, + {file = "pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f52298fbd394f9ed112d56f3d11aabd0d5bd27beb3084cc3d8ad069483b8941"}, + {file = "pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:100baa204bb412b74fe285fb0f3a385256dad1d1879f0a5cb1499ed2e83d132a"}, + {file = "pydantic_core-2.41.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:05a2c8852530ad2812cb7914dc61a1125dc4e06252ee98e5638a12da6cc6fb6c"}, + {file = "pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:29452c56df2ed968d18d7e21f4ab0ac55e71dc59524872f6fc57dcf4a3249ed2"}, + {file = "pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:d5160812ea7a8a2ffbe233d8da666880cad0cbaf5d4de74ae15c313213d62556"}, + {file = "pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:df3959765b553b9440adfd3c795617c352154e497a4eaf3752555cfb5da8fc49"}, + {file = "pydantic_core-2.41.5-cp310-cp310-win32.whl", hash = "sha256:1f8d33a7f4d5a7889e60dc39856d76d09333d8a6ed0f5f1190635cbec70ec4ba"}, + {file = "pydantic_core-2.41.5-cp310-cp310-win_amd64.whl", hash = "sha256:62de39db01b8d593e45871af2af9e497295db8d73b085f6bfd0b18c83c70a8f9"}, + {file = "pydantic_core-2.41.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6"}, + {file = "pydantic_core-2.41.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b"}, + {file = "pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a"}, + {file = "pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8"}, + 
{file = "pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e"}, + {file = "pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1"}, + {file = "pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b"}, + {file = "pydantic_core-2.41.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b"}, + {file = "pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284"}, + {file = "pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594"}, + {file = "pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e"}, + {file = "pydantic_core-2.41.5-cp311-cp311-win32.whl", hash = "sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b"}, + {file = "pydantic_core-2.41.5-cp311-cp311-win_amd64.whl", hash = "sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe"}, + {file = "pydantic_core-2.41.5-cp311-cp311-win_arm64.whl", hash = "sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f"}, + {file = "pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7"}, + {file = "pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0"}, + {file = "pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69"}, + {file = "pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75"}, + {file = "pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05"}, + {file = "pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc"}, + {file = "pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c"}, + {file = "pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5"}, + {file = "pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c"}, + {file = "pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294"}, + {file = "pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1"}, + {file = "pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d"}, + {file = "pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815"}, + {file = "pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3"}, + {file = "pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9"}, + {file = 
"pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34"}, + {file = "pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0"}, + {file = "pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33"}, + {file = "pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e"}, + {file = "pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2"}, + {file = "pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586"}, + {file = "pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d"}, + {file = "pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740"}, + {file = "pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e"}, + {file = "pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858"}, + {file = "pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36"}, + {file = "pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11"}, + {file = "pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", 
hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd"}, + {file = "pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a"}, + {file = "pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14"}, + {file = "pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1"}, + {file = "pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66"}, + {file = "pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869"}, + {file = "pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2"}, + {file = "pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375"}, + {file = "pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553"}, + {file = "pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90"}, + {file = "pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07"}, + {file = "pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb"}, + {file = "pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = 
"sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23"}, + {file = "pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf"}, + {file = "pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = 
"sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008"}, + {file = "pydantic_core-2.41.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:8bfeaf8735be79f225f3fefab7f941c712aaca36f1128c9d7e2352ee1aa87bdf"}, + {file = "pydantic_core-2.41.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:346285d28e4c8017da95144c7f3acd42740d637ff41946af5ce6e5e420502dd5"}, + {file = "pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a75dafbf87d6276ddc5b2bf6fae5254e3d0876b626eb24969a574fff9149ee5d"}, + {file = "pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7b93a4d08587e2b7e7882de461e82b6ed76d9026ce91ca7915e740ecc7855f60"}, + {file = "pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e8465ab91a4bd96d36dde3263f06caa6a8a6019e4113f24dc753d79a8b3a3f82"}, + {file = "pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:299e0a22e7ae2b85c1a57f104538b2656e8ab1873511fd718a1c1c6f149b77b5"}, + {file = "pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:707625ef0983fcfb461acfaf14de2067c5942c6bb0f3b4c99158bed6fedd3cf3"}, + {file = "pydantic_core-2.41.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f41eb9797986d6ebac5e8edff36d5cef9de40def462311b3eb3eeded1431e425"}, + 
{file = "pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0384e2e1021894b1ff5a786dbf94771e2986ebe2869533874d7e43bc79c6f504"}, + {file = "pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:f0cd744688278965817fd0839c4a4116add48d23890d468bc436f78beb28abf5"}, + {file = "pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:753e230374206729bf0a807954bcc6c150d3743928a73faffee51ac6557a03c3"}, + {file = "pydantic_core-2.41.5-cp39-cp39-win32.whl", hash = "sha256:873e0d5b4fb9b89ef7c2d2a963ea7d02879d9da0da8d9d4933dee8ee86a8b460"}, + {file = "pydantic_core-2.41.5-cp39-cp39-win_amd64.whl", hash = "sha256:e4f4a984405e91527a0d62649ee21138f8e3d0ef103be488c1dc11a80d7f184b"}, + {file = "pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034"}, + {file = "pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c"}, + {file = "pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2"}, + {file = "pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad"}, + {file = "pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd"}, + {file = "pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc"}, + {file = "pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56"}, + {file = "pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b5819cd790dbf0c5eb9f82c73c16b39a65dd6dd4d1439dcdea7816ec9adddab8"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5a4e67afbc95fa5c34cf27d9089bca7fcab4e51e57278d710320a70b956d1b9a"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ece5c59f0ce7d001e017643d8d24da587ea1f74f6993467d85ae8a5ef9d4f42b"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:16f80f7abe3351f8ea6858914ddc8c77e02578544a0ebc15b4c2e1a0e813b0b2"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:33cb885e759a705b426baada1fe68cbb0a2e68e34c5d0d0289a364cf01709093"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:c8d8b4eb992936023be7dee581270af5c6e0697a8559895f527f5b7105ecd36a"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:242a206cd0318f95cd21bdacff3fcc3aab23e79bba5cac3db5a841c9ef9c6963"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d3a978c4f57a597908b7e697229d996d77a6d3c94901e9edee593adada95ce1a"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808"}, + {file = 
"pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51"}, + {file = "pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e"}, +] + +[package.dependencies] +typing-extensions = ">=4.14.1" + [[package]] name = "pyflakes" version = "2.5.0" @@ -3659,7 +3902,7 @@ version = "1.3.1" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" -groups = ["docs"] +groups = ["dev", "docs"] files = [ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, @@ -3922,16 +4165,61 @@ test = ["pytest", "ruff"] [[package]] name = "tomli" -version = "2.1.0" +version = "2.4.0" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" groups = ["dev", "docs"] files = [ - {file = "tomli-2.1.0-py3-none-any.whl", hash = 
"sha256:a5c57c3d1c56f5ccdf89f6523458f60ef716e210fc47c4cfb188c5ba473e0391"}, - {file = "tomli-2.1.0.tar.gz", hash = "sha256:3f646cae2aec94e17d04973e4249548320197cfabdf130015d023de4b74d8ab8"}, -] -markers = {dev = "python_version == \"3.10\""} + {file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"}, + {file = "tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9"}, + {file = "tomli-2.4.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:551e321c6ba03b55676970b47cb1b73f14a0a4dce6a3e1a9458fd6d921d72e95"}, + {file = "tomli-2.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e3f639a7a8f10069d0e15408c0b96a2a828cfdec6fca05296ebcdcc28ca7c76"}, + {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1b168f2731796b045128c45982d3a4874057626da0e2ef1fdd722848b741361d"}, + {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:133e93646ec4300d651839d382d63edff11d8978be23da4cc106f5a18b7d0576"}, + {file = "tomli-2.4.0-cp311-cp311-win32.whl", hash = "sha256:b6c78bdf37764092d369722d9946cb65b8767bfa4110f902a1b2542d8d173c8a"}, + {file = "tomli-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:d3d1654e11d724760cdb37a3d7691f0be9db5fbdaef59c9f532aabf87006dbaa"}, + {file = "tomli-2.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:cae9c19ed12d4e8f3ebf46d1a75090e4c0dc16271c5bce1c833ac168f08fb614"}, + {file = "tomli-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:920b1de295e72887bafa3ad9f7a792f811847d57ea6b1215154030cf131f16b1"}, + {file = "tomli-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7d6d9a4aee98fac3eab4952ad1d73aee87359452d1c086b5ceb43ed02ddb16b8"}, + {file = "tomli-2.4.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:36b9d05b51e65b254ea6c2585b59d2c4cb91c8a3d91d0ed0f17591a29aaea54a"}, + {file = "tomli-2.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c8a885b370751837c029ef9bc014f27d80840e48bac415f3412e6593bbc18c1"}, + {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8768715ffc41f0008abe25d808c20c3d990f42b6e2e58305d5da280ae7d1fa3b"}, + {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b438885858efd5be02a9a133caf5812b8776ee0c969fea02c45e8e3f296ba51"}, + {file = "tomli-2.4.0-cp312-cp312-win32.whl", hash = "sha256:0408e3de5ec77cc7f81960c362543cbbd91ef883e3138e81b729fc3eea5b9729"}, + {file = "tomli-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:685306e2cc7da35be4ee914fd34ab801a6acacb061b6a7abca922aaf9ad368da"}, + {file = "tomli-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:5aa48d7c2356055feef06a43611fc401a07337d5b006be13a30f6c58f869e3c3"}, + {file = "tomli-2.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84d081fbc252d1b6a982e1870660e7330fb8f90f676f6e78b052ad4e64714bf0"}, + {file = "tomli-2.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9a08144fa4cba33db5255f9b74f0b89888622109bd2776148f2597447f92a94e"}, + {file = "tomli-2.4.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c73add4bb52a206fd0c0723432db123c0c75c280cbd67174dd9d2db228ebb1b4"}, + {file = "tomli-2.4.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fb2945cbe303b1419e2706e711b7113da57b7db31ee378d08712d678a34e51e"}, + {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bbb1b10aa643d973366dc2cb1ad94f99c1726a02343d43cbc011edbfac579e7c"}, + {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4cbcb367d44a1f0c2be408758b43e1ffb5308abe0ea222897d6bfc8e8281ef2f"}, + {file = "tomli-2.4.0-cp313-cp313-win32.whl", hash = 
"sha256:7d49c66a7d5e56ac959cb6fc583aff0651094ec071ba9ad43df785abc2320d86"}, + {file = "tomli-2.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:3cf226acb51d8f1c394c1b310e0e0e61fecdd7adcb78d01e294ac297dd2e7f87"}, + {file = "tomli-2.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:d20b797a5c1ad80c516e41bc1fb0443ddb5006e9aaa7bda2d71978346aeb9132"}, + {file = "tomli-2.4.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:26ab906a1eb794cd4e103691daa23d95c6919cc2fa9160000ac02370cc9dd3f6"}, + {file = "tomli-2.4.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:20cedb4ee43278bc4f2fee6cb50daec836959aadaf948db5172e776dd3d993fc"}, + {file = "tomli-2.4.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:39b0b5d1b6dd03684b3fb276407ebed7090bbec989fa55838c98560c01113b66"}, + {file = "tomli-2.4.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a26d7ff68dfdb9f87a016ecfd1e1c2bacbe3108f4e0f8bcd2228ef9a766c787d"}, + {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:20ffd184fb1df76a66e34bd1b36b4a4641bd2b82954befa32fe8163e79f1a702"}, + {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:75c2f8bbddf170e8effc98f5e9084a8751f8174ea6ccf4fca5398436e0320bc8"}, + {file = "tomli-2.4.0-cp314-cp314-win32.whl", hash = "sha256:31d556d079d72db7c584c0627ff3a24c5d3fb4f730221d3444f3efb1b2514776"}, + {file = "tomli-2.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:43e685b9b2341681907759cf3a04e14d7104b3580f808cfde1dfdb60ada85475"}, + {file = "tomli-2.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:3d895d56bd3f82ddd6faaff993c275efc2ff38e52322ea264122d72729dca2b2"}, + {file = "tomli-2.4.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:5b5807f3999fb66776dbce568cc9a828544244a8eb84b84b9bafc080c99597b9"}, + {file = "tomli-2.4.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c084ad935abe686bd9c898e62a02a19abfc9760b5a79bc29644463eaf2840cb0"}, + {file = 
"tomli-2.4.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f2e3955efea4d1cfbcb87bc321e00dc08d2bcb737fd1d5e398af111d86db5df"}, + {file = "tomli-2.4.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e0fe8a0b8312acf3a88077a0802565cb09ee34107813bba1c7cd591fa6cfc8d"}, + {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:413540dce94673591859c4c6f794dfeaa845e98bf35d72ed59636f869ef9f86f"}, + {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0dc56fef0e2c1c470aeac5b6ca8cc7b640bb93e92d9803ddaf9ea03e198f5b0b"}, + {file = "tomli-2.4.0-cp314-cp314t-win32.whl", hash = "sha256:d878f2a6707cc9d53a1be1414bbb419e629c3d6e67f69230217bb663e76b5087"}, + {file = "tomli-2.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2add28aacc7425117ff6364fe9e06a183bb0251b03f986df0e78e974047571fd"}, + {file = "tomli-2.4.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2b1e3b80e1d5e52e40e9b924ec43d81570f0e7d09d11081b797bc4692765a3d4"}, + {file = "tomli-2.4.0-py3-none-any.whl", hash = "sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a"}, + {file = "tomli-2.4.0.tar.gz", hash = "sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c"}, +] +markers = {dev = "python_version < \"3.12\""} [[package]] name = "tornado" @@ -3991,6 +4279,21 @@ files = [ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"] +[[package]] +name = "typeguard" +version = "4.5.1" +description = "Run-time type checker for Python" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "typeguard-4.5.1-py3-none-any.whl", hash = "sha256:44d2bf329d49a244110a090b55f5f91aa82d9a9834ebfd30bcc73651e4a8cc40"}, + {file = "typeguard-4.5.1.tar.gz", hash = 
"sha256:f6f8ecbbc819c9bc749983cc67c02391e16a9b43b8b27f15dc70ed7c4a007274"}, +] + +[package.dependencies] +typing_extensions = ">=4.14.0" + [[package]] name = "types-python-dateutil" version = "2.9.0.20241003" @@ -4005,17 +4308,32 @@ files = [ [[package]] name = "typing-extensions" -version = "4.12.2" -description = "Backported and Experimental Type Hints for Python 3.8+" +version = "4.15.0" +description = "Backported and Experimental Type Hints for Python 3.9+" optional = false -python-versions = ">=3.8" -groups = ["main", "docs"] +python-versions = ">=3.9" +groups = ["main", "dev", "docs"] files = [ - {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, - {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, + {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, + {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, ] markers = {docs = "python_version < \"3.12\""} +[[package]] +name = "typing-inspection" +version = "0.4.2" +description = "Runtime typing introspection tools" +optional = false +python-versions = ">=3.9" +groups = ["main", "dev"] +files = [ + {file = "typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7"}, + {file = "typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464"}, +] + +[package.dependencies] +typing-extensions = ">=4.12.0" + [[package]] name = "tzdata" version = "2024.2" @@ -4048,14 +4366,14 @@ devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3) [[package]] name = "unidecode" -version = "1.3.8" +version = "1.4.0" description = "ASCII transliterations of Unicode text" optional = 
false -python-versions = ">=3.5" +python-versions = ">=3.7" groups = ["main"] files = [ - {file = "Unidecode-1.3.8-py3-none-any.whl", hash = "sha256:d130a61ce6696f8148a3bd8fe779c99adeb4b870584eeb9526584e9aa091fd39"}, - {file = "Unidecode-1.3.8.tar.gz", hash = "sha256:cfdb349d46ed3873ece4586b96aa75258726e2fa8ec21d6f00a591d98806c2f4"}, + {file = "Unidecode-1.4.0-py3-none-any.whl", hash = "sha256:c3c7606c27503ad8d501270406e345ddb480a7b5f38827eafe4fa82a137f0021"}, + {file = "Unidecode-1.4.0.tar.gz", hash = "sha256:ce35985008338b676573023acc382d62c264f307c8f7963733405add37ea2b23"}, ] [[package]] @@ -4079,7 +4397,7 @@ version = "1.26.20" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" -groups = ["main", "dev", "docs", "geo"] +groups = ["dev", "docs", "geo"] files = [ {file = "urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e"}, {file = "urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32"}, @@ -4223,4 +4541,4 @@ preprocessing = [] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "366b0eac64aa0a754cb64d4b487807570e290a57ca811bad2295b3efa5e593a0" +content-hash = "5a9f9bf4dbb0bcce1c501595176b4f03faed1ee0a5c7c9581d366606e7cddb1c" diff --git a/pyproject.toml b/pyproject.toml index cc22b17b..28f231d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,12 +23,12 @@ Unidecode = "^1.3.6" dateparser = "^1.1.8" pandas = "^2.2.2" urwid = "^2.1.2" -elasticsearch = { version = "7.16.2", extras=["preprocessing"] } # FTP bigtree = "^0.12.2" aioftp = "^0.21.4" humanize = "^4.8.0" -typing-extensions = "^4.9.0" +typing-extensions = ">=4.10.0" +pydantic = "^2.12.5" [tool.poetry.group.dev.dependencies] pytest = ">=6.1.0" @@ -39,6 +39,7 @@ pre-commit = "^2.20.0" pytest-timeout = "^2.1.0" nbsphinx = "^0.9.3" 
pytest-retry = "1.7.0" +datamodel-code-generator = {extras = ["http"], version = "^0.54.0"} [tool.poetry.group.docs.dependencies] sphinx = "^5.1.1" diff --git a/pysus/api/dadosgov/client.py b/pysus/api/dadosgov/client.py new file mode 100644 index 00000000..54b45691 --- /dev/null +++ b/pysus/api/dadosgov/client.py @@ -0,0 +1,53 @@ +import requests +from typing import List, Optional +from pydantic import TypeAdapter +from pysus.api.dadosgov.models import ( + DatasetDetail, + DatasetSummary, +) +from pysus import __version__ + + +class DadosGov: + def __init__(self, token: str): + self.base_url = "https://dados.gov.br/dados/api" + self.session = requests.Session() + self.session.headers.update( + { + "Accept": "application/json", + "User-Agent": f"PySUS/{__version__}", + "chave-api-dados-abertos": token, + } + ) + + def _get(self, endpoint: str, params: Optional[dict] = None): + url = f"{self.base_url}/{endpoint.lstrip('/')}" + response = self.session.get(url, params=params) + response.raise_for_status() + return response.json() + + def list_datasets( + self, + pagina: int = 1, + nome_conjunto: Optional[str] = None, + dados_abertos: Optional[bool] = None, + is_privado: bool = False, + id_organizacao: Optional[str] = None, + ) -> List[DatasetSummary]: + params = { + "pagina": pagina, + "nomeConjuntoDados": nome_conjunto, + "dadosAbertos": dados_abertos, + "isPrivado": is_privado, + "idOrganizacao": id_organizacao, + } + + params = {k: v for k, v in params.items() if v is not None} + + data = self._get("/publico/conjuntos-dados", params=params) + adapter = TypeAdapter(List[DatasetSummary]) + return adapter.validate_python(data) + + def get_dataset(self, id: str) -> DatasetDetail: + data = self._get(f"/publico/conjuntos-dados/{id}") + return DatasetDetail.model_validate(data) diff --git a/pysus/api/dadosgov/models.py b/pysus/api/dadosgov/models.py new file mode 100644 index 00000000..149cb0fb --- /dev/null +++ b/pysus/api/dadosgov/models.py @@ -0,0 +1,83 @@ +from 
pydantic import BaseModel, Field, BeforeValidator +from datetime import datetime as dt +from typing import Optional, List, Any, Annotated + + +def to_datetime(value: Any) -> Optional[dt]: + if not value or not isinstance(value, str) or "Indisponível" in value: + return None + try: + return dt.strptime(value, "%d/%m/%Y %H:%M:%S") + except ValueError: + try: + return dt.strptime(value, "%d/%m/%Y") + except ValueError: + return None + + +def to_bool(value: Any) -> bool: + if isinstance(value, bool): + return value + return str(value).lower() in ("sim", "true", "1") + + +DateTime = Annotated[Optional[dt], BeforeValidator(to_datetime)] +Bool = Annotated[bool, BeforeValidator(to_bool)] + + +class Tag(BaseModel): + id: str + name: str + display_name: Optional[str] = None + + +class Resource(BaseModel): + id: str + title: str = Field(alias="titulo") + description: str = Field(alias="descricao") + url: str = Field(alias="link") + format: str = Field(alias="formato") + size: int = Field(alias="tamanho") + cataloging_date: DateTime = Field(None, alias="dataCatalogacao") + last_modified: DateTime = Field(None, alias="dataUltimaAtualizacaoArquivo") + download_count: Optional[int] = Field(None, alias="quantidadeDownloads") + file_name: Optional[str] = Field(None, alias="nomeArquivo") + resource_type: Optional[str] = Field(None, alias="tipo") + order_number: Optional[int] = Field(None, alias="numOrdem") + dataset_id: Optional[str] = Field(None, alias="idConjuntoDados") + + +class DatasetDetail(BaseModel): + id: str + title: str = Field(alias="titulo") + slug: str = Field(alias="nome") + organization: str = Field(alias="organizacao") + description: str = Field(alias="descricao") + license: Optional[str] = Field(None, alias="licenca") + maintainer: Optional[str] = Field(None, alias="responsavel") + maintainer_email: Optional[str] = Field(None, alias="emailResponsavel") + frequency: Optional[str] = Field(None, alias="periodicidade") + themes: List[Any] = Field(default_factory=list, 
alias="temas") + tags: List[Tag] = Field(default_factory=list) + resources: List[Resource] = Field(default_factory=list, alias="recursos") + is_open_data: Bool = Field(alias="dadosAbertos") + is_discontinued: Bool = Field(alias="descontinuado") + is_private: Bool = Field(False, alias="privado") + metadata_updated: DateTime = Field(None, alias="dataUltimaAtualizacaoMetadados") + file_updated: DateTime = Field(None, alias="dataUltimaAtualizacaoArquivo") + cataloging_date: DateTime = Field(None, alias="dataCatalogacao") + visibility: str = Field(alias="visibilidade") + status: Optional[str] = Field(None, alias="atualizado") + seal: Optional[str] = Field(None, alias="selo") + source: Optional[str] = Field(None, alias="origemCadastro") + + +class DatasetSummary(BaseModel): + id: str + title: str + name: str = Field(alias="nome") + organization_name: str = Field(alias="nomeOrganizacao") + is_updated: Bool = Field(alias="isAtualizado") + cataloging_date: DateTime = Field(None, alias="catalogacao") + metadata_modified: DateTime = Field(None, alias="ultimaAlteracaoMetadados") + last_update: DateTime = Field(None, alias="ultimaAtualizacaoDados") diff --git a/pysus/api/dadosgov/schemas.py b/pysus/api/dadosgov/schemas.py new file mode 100644 index 00000000..e69de29b diff --git a/pysus/ftp/README.md b/pysus/ftp/README.md new file mode 100644 index 00000000..e69de29b diff --git a/pysus/online_data/ESUS.py b/pysus/online_data/ESUS.py deleted file mode 100644 index ef2b990e..00000000 --- a/pysus/online_data/ESUS.py +++ /dev/null @@ -1,98 +0,0 @@ -import os -from datetime import date - -import pandas as pd -from elasticsearch import Elasticsearch, helpers -from loguru import logger -from pysus.ftp import CACHEPATH - - -def download(uf, cache=True, checkmemory=True): - """ - Download ESUS data by UF - :param uf: rj, mg, etc - :param cache: if results should be cached on disk - :return: DataFrame if data fits in memory, - other an iterator of chunks of size 1000. 
- """ - uf = uf.lower() - user = "user-public-notificacoes" - pwd = "Za4qNXdyQNSa9YaA" - today = date.today() - dt = today.strftime("_%d_%m_%Y") - base = f"desc-esus-notifica-estado-{uf}" # desc-notificacoes-esusve- - url = f"https://{user}:{pwd}@elasticsearch-saps.saude.gov.br" # noqa: E231 - out = f"ESUS_{uf}_{dt}.parquet" - - cachefile = os.path.join(CACHEPATH, out) - tempfile = os.path.join(CACHEPATH, f"ESUS_temp_{uf.upper()}.csv.gz") - if os.path.exists(cachefile): - logger.info(f"Local parquet file found at {cachefile}") - df = pd.read_parquet(cachefile) - elif os.path.exists(tempfile): - logger.info(f"Local csv file found at {tempfile}") - df = pd.read_csv(tempfile, chunksize=1000) - else: - fname = fetch(base, uf, url) - size = os.stat(fname).st_size - if size > 50e6 and checkmemory: - print(f"Downloaded data is to large: {size / 1e6} MB compressed.") - print( - "Only loading the first 1000 rows. If your computer has enough" - + " memory, set 'checkmemory' to False" - ) - print(f"The full data is in {fname}") - df = pd.read_csv(fname, chunksize=1000) - else: - df = pd.read_csv(fname, low_memory=False) - print(f"{df.shape[0]} records downloaded.") - os.unlink(fname) - if cache: - df.to_parquet(cachefile) - logger.info(f"Data stored as parquet at {cachefile}") - - return df - - -def fetch(base, uf, url): - UF = uf.upper() - print(f"Reading ESUS data for {UF}") - es = Elasticsearch([url], send_get_body_as="POST") - body = {"query": {"match_all": {}}} - results = helpers.scan(es, query=body, index=base) - # df = pd.DataFrame.from_dict( - # [document['_source'] for document in results] - # ) - - chunker = chunky_fetch(results, 3000) - h = 1 - tempfile = os.path.join(CACHEPATH, f"ESUS_temp_{UF}.csv.gz") - for ch in chunker: - df = pd.DataFrame.from_dict(ch) - df.sintomas = df["sintomas"].str.replace( - ";", - "", - ) # remove os ; - if h: - df.to_csv(tempfile) - h = 0 - else: - df.to_csv(tempfile, mode="a", header=False) - # df = pd.read_csv('temp.csv.gz') - - 
return tempfile - - -def chunky_fetch(results, chunk_size=3000): - """Fetches data in chunks to preserve memory""" - data = [] - i = 0 - for d in results: - data.append(d["_source"]) - i += 1 - if i == chunk_size: - yield data - data = [] - i = 0 - else: - yield data diff --git a/pysus/preprocessing/ESUS.py b/pysus/preprocessing/ESUS.py deleted file mode 100644 index 110215c6..00000000 --- a/pysus/preprocessing/ESUS.py +++ /dev/null @@ -1,69 +0,0 @@ -import numpy as np -import pandas as pd -from pysus.online_data.ESUS import download - - -def cases_by_age_and_sex(UF, start="2020-03-01", end="2020-08-31"): - """ - Fetches ESUS covid line list and aggregates by age and sex returning these - counts between start and end dates. - :param UF: State code - :param start: Start date - :param end: end date - :return: dataframe - """ - df = download(uf=UF) - - # Transformando as colunas em datetime type - for cname in df: - if cname.startswith("data"): - df[cname] = pd.to_datetime(df[cname], errors="coerce") - - # Eliminando os valores nulos nas colunas com datas importantes - old_size = len(df) - df.dropna( - subset=["dataNotificacao", "dataInicioSintomas", "dataTeste"], - inplace=True, - ) - print( - f"Removed {old_size - len(df)} rows with missing dates of symptoms, " - "notification or testing" - ) - - # Desconsiderando os resultados negativos ou inconclusivos - df = df.loc[ - ~df.resultadoTeste.isin(["Negativo", "Inconclusivo ou Indeterminado"]) - ] - - # Removendo sexo indeterminado - df = df.loc[df.sexo.isin(["Masculino", "Feminino"])] - - # determinando a data dos primeiros sintomas como a data do index - - df["datesint"] = df["dataInicioSintomas"] - df.set_index("datesint", inplace=True) - df.sort_index(inplace=True, ascending=True) - - # vamos limitar a data inicial e a data final considerando apenas a - # primeira onda - - df = df.loc[start:end] - - ini = np.arange(0, 81, 5) - fin = np.arange(5, 86, 5) - fin[-1] = 120 - faixa_etaria = { - f"[{i},{f})": (i, f) for 
i, f in zip(ini, fin) # noqa: E231 - } - - labels = list(faixa_etaria.keys()) - df["faixa_etaria"] = [ - labels[i - 1] for i in np.digitize(df.idade, bins=ini) - ] - - agreg = ( - df[["sexo", "faixa_etaria"]].groupby(["faixa_etaria", "sexo"]).size() - ) - agreg = agreg.reset_index() - agreg.columns = ["faixa_etaria", "sexo", "n"] - return agreg diff --git a/pysus/tests/test_esus.py b/pysus/tests/test_esus.py deleted file mode 100644 index 68f159bc..00000000 --- a/pysus/tests/test_esus.py +++ /dev/null @@ -1,16 +0,0 @@ -import unittest - -import pytest -from pysus.online_data.ESUS import download - - -class MyTestCase(unittest.TestCase): - @pytest.mark.skip(reason="This test takes too long") - @pytest.mark.timeout(5) - def test_download(self): - df = download(uf="se") - self.assertGreater(len(df), 0) - - -if __name__ == "__main__": - unittest.main() diff --git a/setup.cfg b/setup.cfg index 157d9dfb..f4ccdd89 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,7 +18,7 @@ max-line-length = 79 ignore = D202,D203,W503,E203 [isort] -known_third_party = dbfread,elasticsearch,geobr,geocoder,numpy,pandas,pyarrow,pyreaddbc,requests,tqdm,urllib3 +known_third_party = dbfread,geobr,geocoder,numpy,pandas,pyarrow,pyreaddbc,requests,tqdm,urllib3 ensure_newline_before_comments=true line_length = 79 multi_line_output = 3 From a20f08003473b33db5ee51d940395e1ab6859424 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=A3=20Bida=20Vacaro?= Date: Tue, 3 Mar 2026 12:23:22 -0300 Subject: [PATCH 2/6] refactor: refactor pysus structure BREAKING CHANGE: version 2.0.0 --- poetry.lock | 56 +++++++- pyproject.toml | 1 + pysus/__init__.py | 3 +- pysus/{ftp => api}/README.md | 0 pysus/api/dadosgov/{schemas.py => README.md} | 0 pysus/{utilities => api/dadosgov}/__init__.py | 0 pysus/api/ducklake/README.md | 0 pysus/api/ducklake/__init__.py | 0 pysus/api/ducklake/catalog/models.py | 0 pysus/api/ducklake/client.py | 0 pysus/api/ftp/README.md | 0 pysus/api/ftp/__init__.py | 3 + pysus/{ftp/__init__.py => 
api/ftp/client.py} | 33 ++--- pysus/{ => api}/ftp/databases/__init__.py | 0 pysus/{ => api}/ftp/databases/ciha.py | 13 +- pysus/{ => api}/ftp/databases/cnes.py | 12 +- pysus/{ => api}/ftp/databases/ibge_datasus.py | 8 +- pysus/{ => api}/ftp/databases/pni.py | 18 +-- pysus/{ => api}/ftp/databases/sia.py | 13 +- pysus/{ => api}/ftp/databases/sih.py | 13 +- pysus/{ => api}/ftp/databases/sim.py | 4 +- pysus/{ => api}/ftp/databases/sinan.py | 13 +- pysus/{ => api}/ftp/databases/sinasc.py | 4 +- pysus/data/local.py | 12 +- pysus/{ => data}/metadata/SINAN/ANIM.tar.gz | Bin pysus/{ => data}/metadata/SINAN/BOTU.tar.gz | Bin pysus/{ => data}/metadata/SINAN/CHAG.tar.gz | Bin pysus/{ => data}/metadata/SINAN/CHIK.tar.gz | Bin pysus/{ => data}/metadata/SINAN/COLE.tar.gz | Bin pysus/{ => data}/metadata/SINAN/COQU.tar.gz | Bin pysus/{ => data}/metadata/SINAN/DENG.tar.gz | Bin pysus/{ => data}/metadata/SINAN/DIFT.tar.gz | Bin pysus/{ => data}/metadata/SINAN/ESQU.tar.gz | Bin pysus/{ => data}/metadata/SINAN/FAMA.tar.gz | Bin pysus/{ => data}/metadata/SINAN/FMAC.tar.gz | Bin pysus/{ => data}/metadata/SINAN/FTIF.tar.gz | Bin pysus/{ => data}/metadata/SINAN/HANS.tar.gz | Bin pysus/{ => data}/metadata/SINAN/HANT.tar.gz | Bin pysus/{ => data}/metadata/SINAN/HEPA.tar.gz | Bin pysus/{ => data}/metadata/SINAN/IEXO.tar.gz | Bin pysus/{ => data}/metadata/SINAN/LEIV.tar.gz | Bin pysus/{ => data}/metadata/SINAN/LEPT.tar.gz | Bin pysus/{ => data}/metadata/SINAN/LTAN.tar.gz | Bin pysus/{ => data}/metadata/SINAN/MALA.tar.gz | Bin pysus/{ => data}/metadata/SINAN/MENI.tar.gz | Bin pysus/{ => data}/metadata/SINAN/PEST.tar.gz | Bin pysus/{ => data}/metadata/SINAN/RAIV.tar.gz | Bin pysus/{ => data}/metadata/SINAN/SIFC.tar.gz | Bin pysus/{ => data}/metadata/SINAN/SIFG.tar.gz | Bin pysus/{ => data}/metadata/SINAN/TETA.tar.gz | Bin pysus/{ => data}/metadata/SINAN/TETN.tar.gz | Bin pysus/{ => data}/metadata/SINAN/TUBE.tar.gz | Bin pysus/{ => data}/metadata/SINAN/typecast.py | 0 
pysus/data/metadata/__init__.py | 0 pysus/{ => data}/preprocessing/SIM.py | 0 pysus/{ => data}/preprocessing/__init__.py | 0 pysus/{online_data => data/remote}/CIHA.py | 8 +- pysus/{online_data => data/remote}/CNES.py | 5 +- pysus/{online_data => data/remote}/IBGE.py | 17 +-- .../remote}/Infodengue.py | 0 pysus/{online_data => data/remote}/PNI.py | 6 +- pysus/{online_data => data/remote}/SIA.py | 6 +- pysus/{online_data => data/remote}/SIH.py | 6 +- pysus/{online_data => data/remote}/SIM.py | 51 +++---- pysus/{online_data => data/remote}/SINAN.py | 9 +- pysus/{online_data => data/remote}/SINASC.py | 6 +- .../{online_data => data/remote}/__init__.py | 0 .../{online_data => data/remote}/territory.py | 2 +- pysus/{online_data => data/remote}/vaccine.py | 6 +- pysus/ftp/utils.py | 28 ---- pysus/online_data/Infogripe.py | 23 ---- pysus/preprocessing/sinan.py | 127 ------------------ pysus/utils/__init__.py | 25 ++++ pysus/{utilities => utils}/brasil.py | 23 ++++ pysus/{preprocessing => utils}/decoders.py | 27 ++-- .../{dataset => utils}/geocode_by_cities.json | 0 pysus/{utilities => utils}/municipios.json | 0 77 files changed, 233 insertions(+), 348 deletions(-) rename pysus/{ftp => api}/README.md (100%) rename pysus/api/dadosgov/{schemas.py => README.md} (100%) rename pysus/{utilities => api/dadosgov}/__init__.py (100%) create mode 100644 pysus/api/ducklake/README.md create mode 100644 pysus/api/ducklake/__init__.py create mode 100644 pysus/api/ducklake/catalog/models.py create mode 100644 pysus/api/ducklake/client.py create mode 100644 pysus/api/ftp/README.md create mode 100644 pysus/api/ftp/__init__.py rename pysus/{ftp/__init__.py => api/ftp/client.py} (95%) rename pysus/{ => api}/ftp/databases/__init__.py (100%) rename pysus/{ => api}/ftp/databases/ciha.py (91%) rename pysus/{ => api}/ftp/databases/cnes.py (92%) rename pysus/{ => api}/ftp/databases/ibge_datasus.py (92%) rename pysus/{ => api}/ftp/databases/pni.py (87%) rename pysus/{ => api}/ftp/databases/sia.py 
(92%) rename pysus/{ => api}/ftp/databases/sih.py (91%) rename pysus/{ => api}/ftp/databases/sim.py (94%) rename pysus/{ => api}/ftp/databases/sinan.py (94%) rename pysus/{ => api}/ftp/databases/sinasc.py (95%) rename pysus/{ => data}/metadata/SINAN/ANIM.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/BOTU.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/CHAG.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/CHIK.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/COLE.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/COQU.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/DENG.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/DIFT.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/ESQU.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/FAMA.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/FMAC.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/FTIF.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/HANS.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/HANT.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/HEPA.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/IEXO.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/LEIV.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/LEPT.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/LTAN.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/MALA.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/MENI.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/PEST.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/RAIV.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/SIFC.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/SIFG.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/TETA.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/TETN.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/TUBE.tar.gz (100%) rename pysus/{ => data}/metadata/SINAN/typecast.py (100%) create mode 100644 pysus/data/metadata/__init__.py rename pysus/{ => data}/preprocessing/SIM.py (100%) 
rename pysus/{ => data}/preprocessing/__init__.py (100%) rename pysus/{online_data => data/remote}/CIHA.py (90%) rename pysus/{online_data => data/remote}/CNES.py (96%) rename pysus/{online_data => data/remote}/IBGE.py (97%) rename pysus/{online_data => data/remote}/Infodengue.py (100%) rename pysus/{online_data => data/remote}/PNI.py (93%) rename pysus/{online_data => data/remote}/SIA.py (96%) rename pysus/{online_data => data/remote}/SIH.py (94%) rename pysus/{online_data => data/remote}/SIM.py (91%) rename pysus/{online_data => data/remote}/SINAN.py (89%) rename pysus/{online_data => data/remote}/SINASC.py (92%) rename pysus/{online_data => data/remote}/__init__.py (100%) rename pysus/{online_data => data/remote}/territory.py (92%) rename pysus/{online_data => data/remote}/vaccine.py (98%) delete mode 100644 pysus/ftp/utils.py delete mode 100644 pysus/online_data/Infogripe.py delete mode 100644 pysus/preprocessing/sinan.py create mode 100644 pysus/utils/__init__.py rename pysus/{utilities => utils}/brasil.py (70%) rename pysus/{preprocessing => utils}/decoders.py (94%) rename pysus/{dataset => utils}/geocode_by_cities.json (100%) rename pysus/{utilities => utils}/municipios.json (100%) diff --git a/poetry.lock b/poetry.lock index daacd197..3454fabd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -974,6 +974,60 @@ files = [ {file = "docutils-0.18.1.tar.gz", hash = "sha256:679987caf361a7539d76e584cbeddc311e3aee937877c87346f31debc63e9d06"}, ] +[[package]] +name = "duckdb" +version = "1.4.4" +description = "DuckDB in-process database" +optional = false +python-versions = ">=3.9.0" +groups = ["main"] +files = [ + {file = "duckdb-1.4.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e870a441cb1c41d556205deb665749f26347ed13b3a247b53714f5d589596977"}, + {file = "duckdb-1.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:49123b579e4a6323e65139210cd72dddc593a72d840211556b60f9703bda8526"}, + {file = "duckdb-1.4.4-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:5e1933fac5293fea5926b0ee75a55b8cfe7f516d867310a5b251831ab61fe62b"}, + {file = "duckdb-1.4.4-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:707530f6637e91dc4b8125260595299ec9dd157c09f5d16c4186c5988bfbd09a"}, + {file = "duckdb-1.4.4-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:453b115f4777467f35103d8081770ac2f223fb5799178db5b06186e3ab51d1f2"}, + {file = "duckdb-1.4.4-cp310-cp310-win_amd64.whl", hash = "sha256:a3c8542db7ffb128aceb7f3b35502ebaddcd4f73f1227569306cc34bad06680c"}, + {file = "duckdb-1.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5ba684f498d4e924c7e8f30dd157da8da34c8479746c5011b6c0e037e9c60ad2"}, + {file = "duckdb-1.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5536eb952a8aa6ae56469362e344d4e6403cc945a80bc8c5c2ebdd85d85eb64b"}, + {file = "duckdb-1.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:47dd4162da6a2be59a0aef640eb08d6360df1cf83c317dcc127836daaf3b7f7c"}, + {file = "duckdb-1.4.4-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6cb357cfa3403910e79e2eb46c8e445bb1ee2fd62e9e9588c6b999df4256abc1"}, + {file = "duckdb-1.4.4-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c25d5b0febda02b7944e94fdae95aecf952797afc8cb920f677b46a7c251955"}, + {file = "duckdb-1.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:6703dd1bb650025b3771552333d305d62ddd7ff182de121483d4e042ea6e2e00"}, + {file = "duckdb-1.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:bf138201f56e5d6fc276a25138341b3523e2f84733613fc43f02c54465619a95"}, + {file = "duckdb-1.4.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ddcfd9c6ff234da603a1edd5fd8ae6107f4d042f74951b65f91bc5e2643856b3"}, + {file = "duckdb-1.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6792ca647216bd5c4ff16396e4591cfa9b4a72e5ad7cdd312cec6d67e8431a7c"}, + {file = "duckdb-1.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:1f8d55843cc940e36261689054f7dfb6ce35b1f5b0953b0d355b6adb654b0d52"}, + {file = "duckdb-1.4.4-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c65d15c440c31e06baaebfd2c06d71ce877e132779d309f1edf0a85d23c07e92"}, + {file = "duckdb-1.4.4-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b297eff642503fd435a9de5a9cb7db4eccb6f61d61a55b30d2636023f149855f"}, + {file = "duckdb-1.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:d525de5f282b03aa8be6db86b1abffdceae5f1055113a03d5b50cd2fb8cf2ef8"}, + {file = "duckdb-1.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:50f2eb173c573811b44aba51176da7a4e5c487113982be6a6a1c37337ec5fa57"}, + {file = "duckdb-1.4.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:337f8b24e89bc2e12dadcfe87b4eb1c00fd920f68ab07bc9b70960d6523b8bc3"}, + {file = "duckdb-1.4.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0509b39ea7af8cff0198a99d206dca753c62844adab54e545984c2e2c1381616"}, + {file = "duckdb-1.4.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fb94de6d023de9d79b7edc1ae07ee1d0b4f5fa8a9dcec799650b5befdf7aafec"}, + {file = "duckdb-1.4.4-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0d636ceda422e7babd5e2f7275f6a0d1a3405e6a01873f00d38b72118d30c10b"}, + {file = "duckdb-1.4.4-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7df7351328ffb812a4a289732f500d621e7de9942a3a2c9b6d4afcf4c0e72526"}, + {file = "duckdb-1.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:6fb1225a9ea5877421481d59a6c556a9532c32c16c7ae6ca8d127e2b878c9389"}, + {file = "duckdb-1.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:f28a18cc790217e5b347bb91b2cab27aafc557c58d3d8382e04b4fe55d0c3f66"}, + {file = "duckdb-1.4.4-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:25874f8b1355e96178079e37312c3ba6d61a2354f51319dae860cf21335c3a20"}, + {file = "duckdb-1.4.4-cp314-cp314-macosx_10_15_x86_64.whl", hash = 
"sha256:452c5b5d6c349dc5d1154eb2062ee547296fcbd0c20e9df1ed00b5e1809089da"}, + {file = "duckdb-1.4.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8e5c2d8a0452df55e092959c0bfc8ab8897ac3ea0f754cb3b0ab3e165cd79aff"}, + {file = "duckdb-1.4.4-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1af6e76fe8bd24875dc56dd8e38300d64dc708cd2e772f67b9fbc635cc3066a3"}, + {file = "duckdb-1.4.4-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d0440f59e0cd9936a9ebfcf7a13312eda480c79214ffed3878d75947fc3b7d6d"}, + {file = "duckdb-1.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:59c8d76016dde854beab844935b1ec31de358d4053e792988108e995b18c08e7"}, + {file = "duckdb-1.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:53cd6423136ab44383ec9955aefe7599b3fb3dd1fe006161e6396d8167e0e0d4"}, + {file = "duckdb-1.4.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8097201bc5fd0779d7fcc2f3f4736c349197235f4cb7171622936343a1aa8dbf"}, + {file = "duckdb-1.4.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cd1be3d48577f5b40eb9706c6b2ae10edfe18e78eb28e31a3b922dcff1183597"}, + {file = "duckdb-1.4.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e041f2fbd6888da090eca96ac167a7eb62d02f778385dd9155ed859f1c6b6dc8"}, + {file = "duckdb-1.4.4-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7eec0bf271ac622e57b7f6554a27a6e7d1dd2f43d1871f7962c74bcbbede15ba"}, + {file = "duckdb-1.4.4-cp39-cp39-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5cdc4126ec925edf3112bc656ac9ed23745294b854935fa7a643a216e4455af6"}, + {file = "duckdb-1.4.4-cp39-cp39-win_amd64.whl", hash = "sha256:c9566a4ed834ec7999db5849f53da0a7ee83d86830c33f471bf0211a1148ca12"}, + {file = "duckdb-1.4.4.tar.gz", hash = "sha256:8bba52fd2acb67668a4615ee17ee51814124223de836d9e2fdcbc4c9021b3d3c"}, +] + +[package.extras] +all = ["adbc-driver-manager", "fsspec", "ipython", "numpy", "pandas", "pyarrow"] + [[package]] name = "exceptiongroup" version = 
"1.2.2" @@ -4541,4 +4595,4 @@ preprocessing = [] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "5a9f9bf4dbb0bcce1c501595176b4f03faed1ee0a5c7c9581d366606e7cddb1c" +content-hash = "4b551ecb1dddda94c2ea6579463188e6aa0ab5da486b63dd8bed941ce9a4d7db" diff --git a/pyproject.toml b/pyproject.toml index 28f231d9..781738c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ aioftp = "^0.21.4" humanize = "^4.8.0" typing-extensions = ">=4.10.0" pydantic = "^2.12.5" +duckdb = "^1.4.4" [tool.poetry.group.dev.dependencies] pytest = ">=6.1.0" diff --git a/pysus/__init__.py b/pysus/__init__.py index 19a54a36..1cfef31f 100644 --- a/pysus/__init__.py +++ b/pysus/__init__.py @@ -3,8 +3,7 @@ from importlib import metadata as importlib_metadata -from pysus.ftp.databases import * # noqa -from pysus.ftp.databases import AVAILABLE_DATABASES +from pysus.api.ftp.databases import * # noqa def get_version() -> str: diff --git a/pysus/ftp/README.md b/pysus/api/README.md similarity index 100% rename from pysus/ftp/README.md rename to pysus/api/README.md diff --git a/pysus/api/dadosgov/schemas.py b/pysus/api/dadosgov/README.md similarity index 100% rename from pysus/api/dadosgov/schemas.py rename to pysus/api/dadosgov/README.md diff --git a/pysus/utilities/__init__.py b/pysus/api/dadosgov/__init__.py similarity index 100% rename from pysus/utilities/__init__.py rename to pysus/api/dadosgov/__init__.py diff --git a/pysus/api/ducklake/README.md b/pysus/api/ducklake/README.md new file mode 100644 index 00000000..e69de29b diff --git a/pysus/api/ducklake/__init__.py b/pysus/api/ducklake/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pysus/api/ducklake/catalog/models.py b/pysus/api/ducklake/catalog/models.py new file mode 100644 index 00000000..e69de29b diff --git a/pysus/api/ducklake/client.py b/pysus/api/ducklake/client.py new file mode 100644 index 00000000..e69de29b diff --git a/pysus/api/ftp/README.md 
b/pysus/api/ftp/README.md new file mode 100644 index 00000000..e69de29b diff --git a/pysus/api/ftp/__init__.py b/pysus/api/ftp/__init__.py new file mode 100644 index 00000000..65944e50 --- /dev/null +++ b/pysus/api/ftp/__init__.py @@ -0,0 +1,3 @@ +from .client import * # noqa +from .databases import * # noqa + diff --git a/pysus/ftp/__init__.py b/pysus/api/ftp/client.py similarity index 95% rename from pysus/ftp/__init__.py rename to pysus/api/ftp/client.py index 93d0dd01..453ed3f4 100644 --- a/pysus/ftp/__init__.py +++ b/pysus/api/ftp/client.py @@ -1,5 +1,7 @@ from __future__ import annotations +__all__ = ["File", "Directory", "Database", "CACHEPATH"] + import asyncio import os import pathlib @@ -14,7 +16,6 @@ Protocol, Tuple, TypedDict, - TypeVar, Union, runtime_checkable, ) @@ -23,13 +24,13 @@ from aioftp import Client from loguru import logger from pysus.data.local import Data +from pysus.utils import to_list from tqdm import tqdm from typing_extensions import Self # Type aliases PathLike = Union[str, pathlib.Path] FileContent = Dict[str, Union["Directory", "File"]] -T = TypeVar("T") # Constants CACHEPATH: Final[str] = os.getenv( @@ -39,13 +40,6 @@ __cachepath__.mkdir(exist_ok=True) -def to_list(item: Union[T, List[T], Tuple[T, ...], None]) -> List[T]: - """Parse any builtin data type into a list""" - if item is None: - return [] - return [item] if not isinstance(item, (list, tuple)) else list(item) - - # Cache storage DIRECTORY_CACHE: Dict[str, "Directory"] = {} @@ -399,17 +393,13 @@ def load_directory_content(path: str) -> FileContent: def line_parser(line: str): if "" in line: date, time, _, name = line.strip().split(maxsplit=3) - modify = datetime.strptime( - f"{date} {time}", "%m-%d-%y %I:%M%p" - ) + modify = datetime.strptime(f"{date} {time}", "%m-%d-%y %I:%M%p") info = {"size": 0, "type": "dir", "modify": modify} xpath = f"{path}/{name}" content[name] = Directory(xpath) else: date, time, size, name = line.strip().split(maxsplit=3) - modify = 
datetime.strptime( - f"{date} {time}", "%m-%d-%y %I:%M%p" - ) + modify = datetime.strptime(f"{date} {time}", "%m-%d-%y %I:%M%p") info: FileInfo = { "size": size, "type": "file", @@ -471,7 +461,7 @@ def __init__(self) -> None: self.__content__ = {} def __repr__(self) -> str: - return f'{self.name} - {self.metadata["long_name"]}' + return f"{self.name} - {self.metadata['long_name']}" @property def content(self) -> List[Union[Directory, File]]: @@ -482,8 +472,7 @@ def content(self) -> List[Union[Directory, File]]: """ if not self.__content__: logger.info( - "content is not loaded, use `load()` to load default paths" - ) + "content is not loaded, use `load()` to load default paths") return [] return sorted(list(self.__content__.values()), key=str) @@ -548,9 +537,7 @@ def get_files(self, *args, **kwargs) -> list[File]: """ ... - def download( - self, files: List[File], local_dir: str = CACHEPATH - ) -> List[str]: + def download(self, files: List[File], local_dir: str = CACHEPATH) -> List[str]: """ Downloads a list of Files. 
""" @@ -565,9 +552,7 @@ def download( return dfiles[0] return dfiles - async def async_download( - self, files: List[File], local_dir: str = CACHEPATH - ): + async def async_download(self, files: List[File], local_dir: str = CACHEPATH): """ Asynchronously downloads a list of files """ diff --git a/pysus/ftp/databases/__init__.py b/pysus/api/ftp/databases/__init__.py similarity index 100% rename from pysus/ftp/databases/__init__.py rename to pysus/api/ftp/databases/__init__.py diff --git a/pysus/ftp/databases/ciha.py b/pysus/api/ftp/databases/ciha.py similarity index 91% rename from pysus/ftp/databases/ciha.py rename to pysus/api/ftp/databases/ciha.py index 5c8c43c4..b84d18ab 100644 --- a/pysus/ftp/databases/ciha.py +++ b/pysus/api/ftp/databases/ciha.py @@ -2,8 +2,8 @@ from typing import List, Optional, Union -from pysus.ftp import Database, Directory, File -from pysus.ftp.utils import MONTHS, UFs, parse_UFs, to_list, zfill_year +from pysus.api.ftp import Database, Directory, File +from pysus.utils import MONTHS, UFs, parse_UFs, to_list, zfill_year class CIHA(Database): @@ -74,17 +74,16 @@ def get_files( group: Union[List[str], str] = "CIHA", ) -> List[File]: files = list( - filter( - lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files - ) + filter(lambda f: f.extension.upper() + in [".DBC", ".DBF"], self.files) ) groups = [gr.upper() for gr in to_list(group)] if not all(gr in list(self.groups) for gr in groups): raise ValueError( - "Unknown CIHA Group(s): " - f"{set(groups).difference(list(self.groups))}" + f"Unknown CIHA Group(s): {set( + groups).difference(list(self.groups))}" ) files = list(filter(lambda f: self.format(f)[0] in groups, files)) diff --git a/pysus/ftp/databases/cnes.py b/pysus/api/ftp/databases/cnes.py similarity index 92% rename from pysus/ftp/databases/cnes.py rename to pysus/api/ftp/databases/cnes.py index 1e070be7..61235fba 100644 --- a/pysus/ftp/databases/cnes.py +++ b/pysus/api/ftp/databases/cnes.py @@ -2,8 +2,8 @@ from typing 
import List, Optional, Union -from pysus.ftp import Database, Directory, File -from pysus.ftp.utils import MONTHS, UFs, parse_UFs, to_list, zfill_year +from pysus.api.ftp import Database, Directory, File +from pysus.utils import MONTHS, UFs, parse_UFs, to_list, zfill_year class CNES(Database): @@ -55,12 +55,10 @@ def load( if groups: groups = to_list(groups) - if not all( - group in self.groups for group in [gr.upper() for gr in groups] - ): + if not all(group in self.groups for group in [gr.upper() for gr in groups]): raise ValueError( - "Unknown CNES group(s): " - f"{set(groups).difference(self.groups)}" + f"Unknown CNES group(s): {set( + groups).difference(self.groups)}" ) for group in groups: diff --git a/pysus/ftp/databases/ibge_datasus.py b/pysus/api/ftp/databases/ibge_datasus.py similarity index 92% rename from pysus/ftp/databases/ibge_datasus.py rename to pysus/api/ftp/databases/ibge_datasus.py index d1547ae5..39fa6c02 100644 --- a/pysus/ftp/databases/ibge_datasus.py +++ b/pysus/api/ftp/databases/ibge_datasus.py @@ -2,8 +2,8 @@ from typing import List, Literal, Optional, Union -from pysus.ftp import Database, Directory, File -from pysus.ftp.utils import zfill_year +from pysus.api.ftp import Database, Directory, File +from pysus.utils import zfill_year class IBGEDATASUS(Database): @@ -73,9 +73,7 @@ def get_files( if year: if isinstance(year, (str, int)): files = [ - f - for f in files - if self.describe(f)["year"] == zfill_year(year) + f for f in files if self.describe(f)["year"] == zfill_year(year) ] elif isinstance(year, list): files = [ diff --git a/pysus/ftp/databases/pni.py b/pysus/api/ftp/databases/pni.py similarity index 87% rename from pysus/ftp/databases/pni.py rename to pysus/api/ftp/databases/pni.py index 37cf8484..ef154287 100644 --- a/pysus/ftp/databases/pni.py +++ b/pysus/api/ftp/databases/pni.py @@ -2,17 +2,15 @@ from typing import List, Literal, Optional, Union -from pysus.ftp import Database, Directory, File -from pysus.ftp.utils import UFs, 
parse_UFs, to_list, zfill_year +from pysus.api.ftp import Database, Directory, File +from pysus.utils import UFs, parse_UFs, to_list, zfill_year class PNI(Database): name = "PNI" paths = (Directory("/dissemin/publicos/PNI/DADOS"),) metadata = { - "long_name": ( - "Sistema de Informações do Programa Nacional de Imunizações" - ), + "long_name": ("Sistema de Informações do Programa Nacional de Imunizações"), "source": ( "https://datasus.saude.gov.br/acesso-a-informacao/morbidade-hospitalar-do-sus-sih-sus/", # noqa "https://datasus.saude.gov.br/acesso-a-informacao/producao-hospitalar-sih-sus/", # noqa @@ -58,7 +56,6 @@ def describe(self, file: File) -> dict: return {} def format(self, file: File) -> tuple: - if len(file.name) != 8: raise ValueError(f"Can't format {file.name}") @@ -73,17 +70,16 @@ def get_files( year: Optional[Union[list, str, int]] = None, ) -> List[File]: files = list( - filter( - lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files - ) + filter(lambda f: f.extension.upper() + in [".DBC", ".DBF"], self.files) ) groups = [gr.upper() for gr in to_list(group)] if not all(gr in list(self.groups) for gr in groups): raise ValueError( - "Unknown PNI Group(s): " - f"{set(groups).difference(list(self.groups))}" + f"Unknown PNI Group(s): {set( + groups).difference(list(self.groups))}" ) files = list(filter(lambda f: self.format(f)[0] in groups, files)) diff --git a/pysus/ftp/databases/sia.py b/pysus/api/ftp/databases/sia.py similarity index 92% rename from pysus/ftp/databases/sia.py rename to pysus/api/ftp/databases/sia.py index 76b5dd7b..3f28d809 100644 --- a/pysus/ftp/databases/sia.py +++ b/pysus/api/ftp/databases/sia.py @@ -2,8 +2,8 @@ from typing import List, Optional, Union -from pysus.ftp import Database, Directory, File -from pysus.ftp.utils import MONTHS, UFs, parse_UFs, to_list, zfill_year +from pysus.api.ftp import Database, Directory, File +from pysus.utils import MONTHS, UFs, parse_UFs, to_list, zfill_year class SIA(Database): @@ -93,17 
+93,16 @@ def get_files( month: Optional[Union[list, str, int]] = None, ) -> List[File]: files = list( - filter( - lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files - ) + filter(lambda f: f.extension.upper() + in [".DBC", ".DBF"], self.files) ) groups = [gr.upper() for gr in to_list(group)] if not all(gr in list(self.groups) for gr in groups): raise ValueError( - "Unknown SIA Group(s): " - f"{set(groups).difference(list(self.groups))}" + f"Unknown SIA Group(s): {set( + groups).difference(list(self.groups))}" ) files = list(filter(lambda f: self.format(f)[0] in groups, files)) diff --git a/pysus/ftp/databases/sih.py b/pysus/api/ftp/databases/sih.py similarity index 91% rename from pysus/ftp/databases/sih.py rename to pysus/api/ftp/databases/sih.py index 97757d8c..0c28400d 100644 --- a/pysus/ftp/databases/sih.py +++ b/pysus/api/ftp/databases/sih.py @@ -2,8 +2,8 @@ from typing import List, Optional, Union -from pysus.ftp import Database, Directory, File -from pysus.ftp.utils import MONTHS, UFs, parse_UFs, to_list, zfill_year +from pysus.api.ftp import Database, Directory, File +from pysus.utils import MONTHS, UFs, parse_UFs, to_list, zfill_year class SIH(Database): @@ -76,17 +76,16 @@ def get_files( month: Optional[Union[list, str, int]] = None, ) -> List[File]: files = list( - filter( - lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files - ) + filter(lambda f: f.extension.upper() + in [".DBC", ".DBF"], self.files) ) groups = [gr.upper() for gr in to_list(group)] if not all(gr in list(self.groups) for gr in groups): raise ValueError( - f"Unknown SIH Group(s): " - f"{set(groups).difference(list(self.groups))}" + f"Unknown SIH Group(s): {set( + groups).difference(list(self.groups))}" ) files = list(filter(lambda f: self.format(f)[0] in groups, files)) diff --git a/pysus/ftp/databases/sim.py b/pysus/api/ftp/databases/sim.py similarity index 94% rename from pysus/ftp/databases/sim.py rename to pysus/api/ftp/databases/sim.py index 83134a49..0a85aa1f 
100644 --- a/pysus/ftp/databases/sim.py +++ b/pysus/api/ftp/databases/sim.py @@ -2,8 +2,8 @@ from typing import List, Optional, Union -from pysus.ftp import Database, Directory, File -from pysus.ftp.utils import UFs, parse_UFs, to_list, zfill_year +from pysus.api.ftp import Database, Directory, File +from pysus.utils import UFs, parse_UFs, to_list, zfill_year class SIM(Database): diff --git a/pysus/ftp/databases/sinan.py b/pysus/api/ftp/databases/sinan.py similarity index 94% rename from pysus/ftp/databases/sinan.py rename to pysus/api/ftp/databases/sinan.py index ccc3ae80..f272d016 100644 --- a/pysus/ftp/databases/sinan.py +++ b/pysus/api/ftp/databases/sinan.py @@ -2,8 +2,8 @@ from typing import List, Optional, Union -from pysus.ftp import Database, Directory, File -from pysus.ftp.utils import to_list, zfill_year +from pysus.api.ftp import Database, Directory, File +from pysus.utils import to_list, zfill_year class SINAN(Database): @@ -122,9 +122,8 @@ def get_files( year: Optional[Union[str, int, list]] = None, ) -> List[File]: files = list( - filter( - lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files - ) + filter(lambda f: f.extension.upper() + in [".DBC", ".DBF"], self.files) ) if dis_code: @@ -132,8 +131,8 @@ def get_files( if codes and not all(code in self.diseases for code in codes): raise ValueError( - "Unknown disease(s): " - f"{set(codes).difference(set(self.diseases))}" + f"Unknown disease(s): {set( + codes).difference(set(self.diseases))}" ) files = list(filter(lambda f: self.format(f)[0] in codes, files)) diff --git a/pysus/ftp/databases/sinasc.py b/pysus/api/ftp/databases/sinasc.py similarity index 95% rename from pysus/ftp/databases/sinasc.py rename to pysus/api/ftp/databases/sinasc.py index aaac7b63..f7e73c29 100644 --- a/pysus/ftp/databases/sinasc.py +++ b/pysus/api/ftp/databases/sinasc.py @@ -2,8 +2,8 @@ from typing import List, Optional, Union -from pysus.ftp import Database, Directory, File -from pysus.ftp.utils import UFs, 
parse_UFs, to_list, zfill_year +from pysus.api.ftp import Database, Directory, File +from pysus.utils import UFs, parse_UFs, to_list, zfill_year class SINASC(Database): diff --git a/pysus/data/local.py b/pysus/data/local.py index 5ea7476e..c9346deb 100644 --- a/pysus/data/local.py +++ b/pysus/data/local.py @@ -8,10 +8,6 @@ class ParquetSet: - """ - A local parquet directory or file - """ - __path__: Union[PurePosixPath, PureWindowsPath] info: Dict @@ -41,6 +37,9 @@ def __init__(self, path: str, _pbar=None) -> None: def __str__(self): return str(self.__path__) + def __fspath__(self): + return str(self) + def __repr__(self): return str(self.__path__) @@ -57,9 +56,8 @@ def to_dataframe(self) -> pd.DataFrame: parquets into a single dataframe """ parquets = list(map(str, self.__path__.glob("*.parquet"))) - chunks_list = [ - pd.read_parquet(str(f), engine="fastparquet") for f in parquets - ] + chunks_list = [pd.read_parquet( + str(f), engine="fastparquet") for f in parquets] _df = pd.concat(chunks_list, ignore_index=True) return parse_dftypes(_df) diff --git a/pysus/metadata/SINAN/ANIM.tar.gz b/pysus/data/metadata/SINAN/ANIM.tar.gz similarity index 100% rename from pysus/metadata/SINAN/ANIM.tar.gz rename to pysus/data/metadata/SINAN/ANIM.tar.gz diff --git a/pysus/metadata/SINAN/BOTU.tar.gz b/pysus/data/metadata/SINAN/BOTU.tar.gz similarity index 100% rename from pysus/metadata/SINAN/BOTU.tar.gz rename to pysus/data/metadata/SINAN/BOTU.tar.gz diff --git a/pysus/metadata/SINAN/CHAG.tar.gz b/pysus/data/metadata/SINAN/CHAG.tar.gz similarity index 100% rename from pysus/metadata/SINAN/CHAG.tar.gz rename to pysus/data/metadata/SINAN/CHAG.tar.gz diff --git a/pysus/metadata/SINAN/CHIK.tar.gz b/pysus/data/metadata/SINAN/CHIK.tar.gz similarity index 100% rename from pysus/metadata/SINAN/CHIK.tar.gz rename to pysus/data/metadata/SINAN/CHIK.tar.gz diff --git a/pysus/metadata/SINAN/COLE.tar.gz b/pysus/data/metadata/SINAN/COLE.tar.gz similarity index 100% rename from 
pysus/metadata/SINAN/COLE.tar.gz rename to pysus/data/metadata/SINAN/COLE.tar.gz diff --git a/pysus/metadata/SINAN/COQU.tar.gz b/pysus/data/metadata/SINAN/COQU.tar.gz similarity index 100% rename from pysus/metadata/SINAN/COQU.tar.gz rename to pysus/data/metadata/SINAN/COQU.tar.gz diff --git a/pysus/metadata/SINAN/DENG.tar.gz b/pysus/data/metadata/SINAN/DENG.tar.gz similarity index 100% rename from pysus/metadata/SINAN/DENG.tar.gz rename to pysus/data/metadata/SINAN/DENG.tar.gz diff --git a/pysus/metadata/SINAN/DIFT.tar.gz b/pysus/data/metadata/SINAN/DIFT.tar.gz similarity index 100% rename from pysus/metadata/SINAN/DIFT.tar.gz rename to pysus/data/metadata/SINAN/DIFT.tar.gz diff --git a/pysus/metadata/SINAN/ESQU.tar.gz b/pysus/data/metadata/SINAN/ESQU.tar.gz similarity index 100% rename from pysus/metadata/SINAN/ESQU.tar.gz rename to pysus/data/metadata/SINAN/ESQU.tar.gz diff --git a/pysus/metadata/SINAN/FAMA.tar.gz b/pysus/data/metadata/SINAN/FAMA.tar.gz similarity index 100% rename from pysus/metadata/SINAN/FAMA.tar.gz rename to pysus/data/metadata/SINAN/FAMA.tar.gz diff --git a/pysus/metadata/SINAN/FMAC.tar.gz b/pysus/data/metadata/SINAN/FMAC.tar.gz similarity index 100% rename from pysus/metadata/SINAN/FMAC.tar.gz rename to pysus/data/metadata/SINAN/FMAC.tar.gz diff --git a/pysus/metadata/SINAN/FTIF.tar.gz b/pysus/data/metadata/SINAN/FTIF.tar.gz similarity index 100% rename from pysus/metadata/SINAN/FTIF.tar.gz rename to pysus/data/metadata/SINAN/FTIF.tar.gz diff --git a/pysus/metadata/SINAN/HANS.tar.gz b/pysus/data/metadata/SINAN/HANS.tar.gz similarity index 100% rename from pysus/metadata/SINAN/HANS.tar.gz rename to pysus/data/metadata/SINAN/HANS.tar.gz diff --git a/pysus/metadata/SINAN/HANT.tar.gz b/pysus/data/metadata/SINAN/HANT.tar.gz similarity index 100% rename from pysus/metadata/SINAN/HANT.tar.gz rename to pysus/data/metadata/SINAN/HANT.tar.gz diff --git a/pysus/metadata/SINAN/HEPA.tar.gz b/pysus/data/metadata/SINAN/HEPA.tar.gz similarity index 100% 
rename from pysus/metadata/SINAN/HEPA.tar.gz rename to pysus/data/metadata/SINAN/HEPA.tar.gz diff --git a/pysus/metadata/SINAN/IEXO.tar.gz b/pysus/data/metadata/SINAN/IEXO.tar.gz similarity index 100% rename from pysus/metadata/SINAN/IEXO.tar.gz rename to pysus/data/metadata/SINAN/IEXO.tar.gz diff --git a/pysus/metadata/SINAN/LEIV.tar.gz b/pysus/data/metadata/SINAN/LEIV.tar.gz similarity index 100% rename from pysus/metadata/SINAN/LEIV.tar.gz rename to pysus/data/metadata/SINAN/LEIV.tar.gz diff --git a/pysus/metadata/SINAN/LEPT.tar.gz b/pysus/data/metadata/SINAN/LEPT.tar.gz similarity index 100% rename from pysus/metadata/SINAN/LEPT.tar.gz rename to pysus/data/metadata/SINAN/LEPT.tar.gz diff --git a/pysus/metadata/SINAN/LTAN.tar.gz b/pysus/data/metadata/SINAN/LTAN.tar.gz similarity index 100% rename from pysus/metadata/SINAN/LTAN.tar.gz rename to pysus/data/metadata/SINAN/LTAN.tar.gz diff --git a/pysus/metadata/SINAN/MALA.tar.gz b/pysus/data/metadata/SINAN/MALA.tar.gz similarity index 100% rename from pysus/metadata/SINAN/MALA.tar.gz rename to pysus/data/metadata/SINAN/MALA.tar.gz diff --git a/pysus/metadata/SINAN/MENI.tar.gz b/pysus/data/metadata/SINAN/MENI.tar.gz similarity index 100% rename from pysus/metadata/SINAN/MENI.tar.gz rename to pysus/data/metadata/SINAN/MENI.tar.gz diff --git a/pysus/metadata/SINAN/PEST.tar.gz b/pysus/data/metadata/SINAN/PEST.tar.gz similarity index 100% rename from pysus/metadata/SINAN/PEST.tar.gz rename to pysus/data/metadata/SINAN/PEST.tar.gz diff --git a/pysus/metadata/SINAN/RAIV.tar.gz b/pysus/data/metadata/SINAN/RAIV.tar.gz similarity index 100% rename from pysus/metadata/SINAN/RAIV.tar.gz rename to pysus/data/metadata/SINAN/RAIV.tar.gz diff --git a/pysus/metadata/SINAN/SIFC.tar.gz b/pysus/data/metadata/SINAN/SIFC.tar.gz similarity index 100% rename from pysus/metadata/SINAN/SIFC.tar.gz rename to pysus/data/metadata/SINAN/SIFC.tar.gz diff --git a/pysus/metadata/SINAN/SIFG.tar.gz b/pysus/data/metadata/SINAN/SIFG.tar.gz similarity 
index 100% rename from pysus/metadata/SINAN/SIFG.tar.gz rename to pysus/data/metadata/SINAN/SIFG.tar.gz diff --git a/pysus/metadata/SINAN/TETA.tar.gz b/pysus/data/metadata/SINAN/TETA.tar.gz similarity index 100% rename from pysus/metadata/SINAN/TETA.tar.gz rename to pysus/data/metadata/SINAN/TETA.tar.gz diff --git a/pysus/metadata/SINAN/TETN.tar.gz b/pysus/data/metadata/SINAN/TETN.tar.gz similarity index 100% rename from pysus/metadata/SINAN/TETN.tar.gz rename to pysus/data/metadata/SINAN/TETN.tar.gz diff --git a/pysus/metadata/SINAN/TUBE.tar.gz b/pysus/data/metadata/SINAN/TUBE.tar.gz similarity index 100% rename from pysus/metadata/SINAN/TUBE.tar.gz rename to pysus/data/metadata/SINAN/TUBE.tar.gz diff --git a/pysus/metadata/SINAN/typecast.py b/pysus/data/metadata/SINAN/typecast.py similarity index 100% rename from pysus/metadata/SINAN/typecast.py rename to pysus/data/metadata/SINAN/typecast.py diff --git a/pysus/data/metadata/__init__.py b/pysus/data/metadata/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pysus/preprocessing/SIM.py b/pysus/data/preprocessing/SIM.py similarity index 100% rename from pysus/preprocessing/SIM.py rename to pysus/data/preprocessing/SIM.py diff --git a/pysus/preprocessing/__init__.py b/pysus/data/preprocessing/__init__.py similarity index 100% rename from pysus/preprocessing/__init__.py rename to pysus/data/preprocessing/__init__.py diff --git a/pysus/online_data/CIHA.py b/pysus/data/remote/CIHA.py similarity index 90% rename from pysus/online_data/CIHA.py rename to pysus/data/remote/CIHA.py index 9be4ecc0..475aec7e 100644 --- a/pysus/online_data/CIHA.py +++ b/pysus/data/remote/CIHA.py @@ -6,19 +6,19 @@ by fccoelho license: GPL V3 or Later """ + from typing import Union from loguru import logger -from pysus.ftp import CACHEPATH -from pysus.ftp.databases.ciha import CIHA -from pysus.ftp.utils import parse_UFs +from pysus.api.ftp import CACHEPATH, CIHA +from pysus.utils.brasil import parse_UFs ciha = CIHA().load() 
def get_available_years( states: Union[list, str] = None, -) -> dict[str : set[int]]: +) -> dict[str: set[int]]: """ Fetch available years for the `states`. :param states: UF code. E.g: "SP" or ["SP", "RJ"] diff --git a/pysus/online_data/CNES.py b/pysus/data/remote/CNES.py similarity index 96% rename from pysus/online_data/CNES.py rename to pysus/data/remote/CNES.py index a3b1188e..1881b558 100644 --- a/pysus/online_data/CNES.py +++ b/pysus/data/remote/CNES.py @@ -1,9 +1,8 @@ from typing import Union from loguru import logger -from pysus.ftp import CACHEPATH -from pysus.ftp.databases.cnes import CNES -from pysus.ftp.utils import parse_UFs +from pysus.api.ftp import CACHEPATH, CNES +from pysus.utils.brasil import parse_UFs cnes = CNES().load() diff --git a/pysus/online_data/IBGE.py b/pysus/data/remote/IBGE.py similarity index 97% rename from pysus/online_data/IBGE.py rename to pysus/data/remote/IBGE.py index 33fba909..5646a3a9 100644 --- a/pysus/online_data/IBGE.py +++ b/pysus/data/remote/IBGE.py @@ -13,7 +13,7 @@ import requests import urllib3 from pysus.data.local import ParquetSet -from pysus.ftp.databases.ibge_datasus import IBGEDATASUS +from pysus.api.ftp import IBGEDATASUS # requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL:@SECLEVEL=1' @@ -296,12 +296,10 @@ class FetchData: resultados vêm a partir do segundo elemento. """ - def __init__( - self, agregado: int, periodos: str, variavel: str = "allxp", **kwargs - ): + def __init__(self, agregado: int, periodos: str, variavel: str = "allxp", **kwargs): self.url = ( - APIBASE - + f"agregados/{agregado}/periodos/{periodos}/variaveis/{variavel}?" + APIBASE + + f"agregados/{agregado}/periodos/{periodos}/variaveis/{variavel}?" ) self.url += "&".join([f"{k}={v}" for k, v in kwargs.items()]) self.JSON = None @@ -390,8 +388,7 @@ def get_population( opts = ["ALF", "ESCA", "ESCB", "IDOSO", "RENDA"] if not censo_data or censo_data not in opts: raise ValueError( - f"Incorrect 'censo_data' parameter. 
Options: {opts}" - ) + f"Incorrect 'censo_data' parameter. Options: {opts}") file = [f for f in files if censo_data in f.name][0].download() else: file = files[0].download() @@ -415,8 +412,6 @@ def _unzip_to_dataframe(file: str) -> pd.DataFrame: return pd.read_csv(zip_file.extract(file, tempdir)) if file.lower().endswith((".dbf", ".dbc")): - return ParquetSet( - zip_file.extract(file, tempdir) - ).to_dataframe() + return ParquetSet(zip_file.extract(file, tempdir)).to_dataframe() raise ValueError(f"No data found in {zip_file}") diff --git a/pysus/online_data/Infodengue.py b/pysus/data/remote/Infodengue.py similarity index 100% rename from pysus/online_data/Infodengue.py rename to pysus/data/remote/Infodengue.py diff --git a/pysus/online_data/PNI.py b/pysus/data/remote/PNI.py similarity index 93% rename from pysus/online_data/PNI.py rename to pysus/data/remote/PNI.py index 2df41c1c..b9f60b80 100644 --- a/pysus/online_data/PNI.py +++ b/pysus/data/remote/PNI.py @@ -1,12 +1,12 @@ """ Download data from the national immunization program """ + from typing import Literal, Union from loguru import logger -from pysus.ftp import CACHEPATH -from pysus.ftp.databases.pni import PNI -from pysus.ftp.utils import parse_UFs +from pysus.api.ftp import CACHEPATH, PNI +from pysus.utils.brasil import parse_UFs pni = PNI().load() diff --git a/pysus/online_data/SIA.py b/pysus/data/remote/SIA.py similarity index 96% rename from pysus/online_data/SIA.py rename to pysus/data/remote/SIA.py index 19ff22a4..6b3b8316 100644 --- a/pysus/online_data/SIA.py +++ b/pysus/data/remote/SIA.py @@ -6,13 +6,13 @@ by bcbernardo license: GPL V3 or Later """ + from pprint import pprint from typing import Dict, Tuple, Union from loguru import logger -from pysus.ftp import CACHEPATH -from pysus.ftp.databases.sia import SIA -from pysus.ftp.utils import parse_UFs +from pysus.api.ftp import CACHEPATH, SIA +from pysus.utils.brasil import parse_UFs sia = SIA().load() diff --git a/pysus/online_data/SIH.py 
b/pysus/data/remote/SIH.py similarity index 94% rename from pysus/online_data/SIH.py rename to pysus/data/remote/SIH.py index 67749f51..523833b9 100644 --- a/pysus/online_data/SIH.py +++ b/pysus/data/remote/SIH.py @@ -4,12 +4,12 @@ by fccoelho license: GPL V3 or Later """ + from typing import Union from loguru import logger -from pysus.ftp import CACHEPATH -from pysus.ftp.databases.sih import SIH -from pysus.ftp.utils import parse_UFs +from pysus.api.ftp import CACHEPATH, SIH +from pysus.utils.brasil import parse_UFs sih = SIH().load() diff --git a/pysus/online_data/SIM.py b/pysus/data/remote/SIM.py similarity index 91% rename from pysus/online_data/SIM.py rename to pysus/data/remote/SIM.py index c021111b..79908e76 100644 --- a/pysus/online_data/SIM.py +++ b/pysus/data/remote/SIM.py @@ -4,6 +4,7 @@ by fccoelho license: GPL V3 or Later """ + import os from ftplib import FTP, error_perm from typing import Union @@ -11,9 +12,8 @@ import pandas as pd from dbfread import DBF from loguru import logger -from pysus.ftp import CACHEPATH -from pysus.ftp.databases.sim import SIM -from pysus.ftp.utils import parse_UFs +from pysus.api.ftp import CACHEPATH, SIM +from pysus.utils.brasil import parse_UFs sim = SIM().load() @@ -68,17 +68,14 @@ def get_CID10_chapters_table(cache=True): ftp = FTP("ftp.datasus.gov.br") ftp.login() logger.debug( - f"Stablishing connection with ftp.datasus.gov.br.\n{ftp.welcome}" - ) + f"Stablishing connection with ftp.datasus.gov.br.\n{ftp.welcome}") ftp.cwd("/dissemin/publicos/SIM/CID10/TABELAS") logger.debug( - "Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS" - ) + "Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS") fname = "CIDCAP10.DBF" cachefile = os.path.join( - CACHEPATH, "SIM_" + fname.split(".")[0] + "_.parquet" - ) + CACHEPATH, "SIM_" + fname.split(".")[0] + "_.parquet") if os.path.exists(cachefile): logger.info(f"Local parquet file found at {cachefile}") @@ -114,17 +111,14 @@ def get_CID10_table(cache=True): ftp 
= FTP("ftp.datasus.gov.br") ftp.login() logger.debug( - f"Stablishing connection with ftp.datasus.gov.br.\n{ftp.welcome}" - ) + f"Stablishing connection with ftp.datasus.gov.br.\n{ftp.welcome}") ftp.cwd("/dissemin/publicos/SIM/CID10/TABELAS") logger.debug( - "Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS" - ) + "Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS") fname = "CID10.DBF" cachefile = os.path.join( - CACHEPATH, "SIM_" + fname.split(".")[0] + "_.parquet" - ) + CACHEPATH, "SIM_" + fname.split(".")[0] + "_.parquet") if os.path.exists(cachefile): logger.info(f"Local parquet file found at {cachefile}") @@ -160,17 +154,14 @@ def get_CID9_table(cache=True): ftp = FTP("ftp.datasus.gov.br") ftp.login() logger.debug( - f"Stablishing connection with ftp.datasus.gov.br.\n{ftp.welcome}" - ) + f"Stablishing connection with ftp.datasus.gov.br.\n{ftp.welcome}") ftp.cwd("/dissemin/publicos/SIM/CID9/TABELAS") logger.debug( - "Changing FTP work dir to: /dissemin/publicos/SIM/CID9/TABELAS" - ) + "Changing FTP work dir to: /dissemin/publicos/SIM/CID9/TABELAS") fname = "CID9.DBF" cachefile = os.path.join( - CACHEPATH, "SIM_" + fname.split(".")[0] + "_.parquet" - ) + CACHEPATH, "SIM_" + fname.split(".")[0] + "_.parquet") if os.path.exists(cachefile): logger.info(f"Local parquet file found at {cachefile}") @@ -206,17 +197,14 @@ def get_municipios(cache=True): ftp = FTP("ftp.datasus.gov.br") ftp.login() logger.debug( - f"Stablishing connection with ftp.datasus.gov.br.\n{ftp.welcome}" - ) + f"Stablishing connection with ftp.datasus.gov.br.\n{ftp.welcome}") ftp.cwd("/dissemin/publicos/SIM/CID10/TABELAS") logger.debug( - "Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS" - ) + "Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS") fname = "CADMUN.DBF" cachefile = os.path.join( - CACHEPATH, "SIM_" + fname.split(".")[0] + "_.parquet" - ) + CACHEPATH, "SIM_" + fname.split(".")[0] + "_.parquet") if os.path.exists(cachefile): 
logger.info(f"Local parquet file found at {cachefile}") @@ -252,16 +240,13 @@ def get_ocupations(cache=True): ftp = FTP("ftp.datasus.gov.br") ftp.login() logger.debug( - f"Stablishing connection with ftp.datasus.gov.br.\n{ftp.welcome}" - ) + f"Stablishing connection with ftp.datasus.gov.br.\n{ftp.welcome}") ftp.cwd("/dissemin/publicos/SIM/CID10/TABELAS") logger.debug( - "Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS" - ) + "Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS") fname = "TABOCUP.DBF" cachefile = os.path.join( - CACHEPATH, "SIM_" + fname.split(".")[0] + "_.parquet" - ) + CACHEPATH, "SIM_" + fname.split(".")[0] + "_.parquet") if os.path.exists(cachefile): logger.info(f"Local parquet file found at {cachefile}") diff --git a/pysus/online_data/SINAN.py b/pysus/data/remote/SINAN.py similarity index 89% rename from pysus/online_data/SINAN.py rename to pysus/data/remote/SINAN.py index fe5692db..abef5277 100644 --- a/pysus/online_data/SINAN.py +++ b/pysus/data/remote/SINAN.py @@ -2,8 +2,7 @@ from typing import Union import pandas as pd -from pysus.ftp import CACHEPATH -from pysus.ftp.databases.sinan import SINAN +from pysus.api.ftp import CACHEPATH, SINAN sinan = SINAN().load() @@ -43,10 +42,8 @@ def download( def metadata_df(disease_code: str) -> pd.DataFrame: metadata_file = ( - Path(__file__).parent.parent - / "metadata" - / "SINAN" - / f"{disease_code}.tar.gz" + Path(__file__).parent.parent / "metadata" / + "SINAN" / f"{disease_code}.tar.gz" ) if metadata_file.exists(): df = pd.read_csv( diff --git a/pysus/online_data/SINASC.py b/pysus/data/remote/SINASC.py similarity index 92% rename from pysus/online_data/SINASC.py rename to pysus/data/remote/SINASC.py index 2469d88a..5307475a 100644 --- a/pysus/online_data/SINASC.py +++ b/pysus/data/remote/SINASC.py @@ -4,12 +4,12 @@ by fccoelho license: GPL V3 or Later """ + from typing import Union from loguru import logger -from pysus.ftp import CACHEPATH -from pysus.ftp.databases.sinasc 
import SINASC -from pysus.ftp.utils import parse_UFs +from pysus.api.ftp import CACHEPATH, SINASC +from pysus.utils.brasil import parse_UFs sinasc = SINASC().load() diff --git a/pysus/online_data/__init__.py b/pysus/data/remote/__init__.py similarity index 100% rename from pysus/online_data/__init__.py rename to pysus/data/remote/__init__.py diff --git a/pysus/online_data/territory.py b/pysus/data/remote/territory.py similarity index 92% rename from pysus/online_data/territory.py rename to pysus/data/remote/territory.py index 404a5ad2..7ee6306d 100644 --- a/pysus/online_data/territory.py +++ b/pysus/data/remote/territory.py @@ -1,6 +1,6 @@ from typing import List, Union -from pysus.ftp import CACHEPATH, Directory, File +from pysus.api.ftp import CACHEPATH, Directory, File def list_tables() -> List[File]: diff --git a/pysus/online_data/vaccine.py b/pysus/data/remote/vaccine.py similarity index 98% rename from pysus/online_data/vaccine.py rename to pysus/data/remote/vaccine.py index 77399f7c..4f874334 100644 --- a/pysus/online_data/vaccine.py +++ b/pysus/data/remote/vaccine.py @@ -5,6 +5,7 @@ - COVID-19 in 2020-2021 Downloaded as described [here](http://opendatasus.saude.gov.br/dataset/b772ee55-07cd-44d8-958f-b12edd004e0b/resource/5916b3a4-81e7-4ad5-adb6-b884ff198dc1/download/manual_api_vacina_covid-19.pdf) # noqa """ + import json import os from json import JSONDecodeError @@ -12,7 +13,7 @@ import pandas as pd import requests from loguru import logger -from pysus.ftp import CACHEPATH +from pysus.api.ftp import CACHEPATH from requests.auth import HTTPBasicAuth @@ -41,8 +42,7 @@ def download_covid(uf=None, only_header=False): tempfile = os.path.join(CACHEPATH, f"Vaccine_temp_{UF}.csv.gz") if os.path.exists(tempfile): print( - "loading from cache. Returning an iterator of Dataframes in chunks" - " of 5000." + "loading from cache. Returning an iterator of Dataframes in chunks of 5000." 
) return pd.read_csv(tempfile, chunksize=5000) diff --git a/pysus/ftp/utils.py b/pysus/ftp/utils.py deleted file mode 100644 index b700474d..00000000 --- a/pysus/ftp/utils.py +++ /dev/null @@ -1,28 +0,0 @@ -import datetime -from typing import Union - -from pysus.ftp import to_list -from pysus.utilities.brasil import MONTHS, UFs # noqa - - -def zfill_year(year: Union[str, int]) -> int: - """ - Formats a len(2) year into len(4) with the correct year preffix - E.g: 20 -> 2020; 99 -> 1999 - """ - year = str(year)[-2:].zfill(2) - current_year = str(datetime.datetime.now().year)[-2:] - suffix = "19" if str(year) > current_year else "20" - return int(suffix + str(year)) - - -def parse_UFs(UF: Union[list[str], str]) -> list: - """ - Formats states abbreviations into correct format and retuns a list. - Also checks if there is an incorrect UF in the list. - E.g: ['SC', 'mt', 'ba'] -> ['SC', 'MT', 'BA'] - """ - ufs = [uf.upper() for uf in to_list(UF)] - if not all(uf in list(UFs) for uf in ufs): - raise ValueError(f"Unknown UF(s): {set(ufs).difference(list(UFs))}") - return ufs diff --git a/pysus/online_data/Infogripe.py b/pysus/online_data/Infogripe.py deleted file mode 100644 index bd496c79..00000000 --- a/pysus/online_data/Infogripe.py +++ /dev/null @@ -1,23 +0,0 @@ -""" -Downloads data made available by the Infogripe service -""" - -import pandas as pd - -BASEURL = r"https://gitlab.fiocruz.br/marcelo.gomes/infogripe/-/raw/master/Dados/InfoGripe/" # noqa -DATASETS = { - "Alerta de situação": r"tabela_de_alerta.csv", - "Casos por idade, sexo e virus": r"dados_semanais_faixa_etaria_sexo_virus.csv.gz", # noqa - "Casos Totais e estimativas": r"serie_temporal_com_estimativas_recentes.csv.gz", # noqa - "Valores esperados por localidades": "valores_esperados_por_localidade.csv", # noqa -} - - -def list_datasets(): - return list(DATASETS.keys()) - - -def download(dataset_name): - url = BASEURL + DATASETS[dataset_name] + "?inline=false" - df = pd.read_csv(url, delimiter=";", 
decimal=",") - return df diff --git a/pysus/preprocessing/sinan.py b/pysus/preprocessing/sinan.py deleted file mode 100644 index cb6945ed..00000000 --- a/pysus/preprocessing/sinan.py +++ /dev/null @@ -1,127 +0,0 @@ -import os -from functools import lru_cache - -import geocoder -import numpy as np -import pandas as pd -import requests -from dbfread import DBF - - -def read_sinan_dbf(fname, encoding) -> pd.DataFrame: - """ - Read SINAN dbf file returning a Pandas Dataframe with - :param fname: dbf file name - :param encoding: Encoding of the dbf - :return: pandas dataframe - """ - db = DBF(fname, encoding=encoding) - df = pd.DataFrame(list(db)) - - def convert_week(x): - try: - w = int(x) % 100 - except ValueError: - w = np.nan - return w - - for cname in df.columns: - df[cname].replace("", np.nan, inplace=True) - if cname.startswith(("NU", "ID")): - try: - df[cname] = pd.to_numeric(df[cname]) - except ValueError as e: - print(f"Column {cname} could not be converted to numeric: {e}") - # certain IDs can be alphanumerical - pass - elif cname.startswith("SEM"): - df[cname] = df[cname].map(convert_week) - - return df - - -@lru_cache(maxsize=None) -def get_geocodes(geoc): - """ - Return city name and state two letter code from geocode - :param geoc: - :return: - """ - url = ( - "http://cidades.ibge.gov.br/services/jSonpMuns.php?" 
- "busca=330&featureClass=P&style=full&maxRows=5&name_startsWith={}" - ).format(geoc) - resp = requests.get(url) - for d in resp.json()["municipios"]: - if int(geoc) == int(d["c"]): - return [d["n"].encode("latin-1").decode("utf-8"), d["s"]] - - else: - raise KeyError("could not find geocode {} in ".format(geoc)) - - -def _address_generator(df, default=""): - for row in df.iterrows(): - line = dict(row[1]) - try: - line["cidade"] = ",".join(get_geocodes(line["ID_MN_RESI"])) - except KeyError: - print("Could not find geocode {} using default") - line["cidade"] = default - yield line[ - "NU_NOTIFIC" - ], "{NM_LOGRADO}, {NU_NUMERO}, {NM_BAIRRO}, {cidade}, Brasil".format( - **line - ) - - -def geocode(sinan_df, outfile, default_city): - """ - Geocode cases based on addresses included. - :param default_city: default city to use in case of bad Geocode found in - file. It can be "city, state" - :param sinan_df: Dataframe generated from sinan DBF - :param outfile: File on Which - """ - addrs = _address_generator(sinan_df, default_city) - if os.path.exists(outfile): - mode = "a" - coords = pd.read_csv(outfile) - geocoded = coords.NU_NOTIFIC.tolist() - else: - mode = "w" - geocoded = [] - with open(outfile, mode) as of: - if mode == "w": - of.write("NU_NOTIFIC,latitude,longitude\n") - for nu, ad in addrs: - # ad = ad.encode('latin-1').decode('utf-8') - if nu in geocoded: - continue - location = geocoder.google(ad) - if location is None: - raise NameError("Google could not find {}".format(ad)) - if location.latlng == []: - print( - ( - "Search for {} returned {} as coordinates, trying " - "reduced address:" - ).format(ad, location.latlng) - ) - ad = ",".join(ad.split(",")[2:]) - print(ad) - location = geocoder.google(ad) - try: - of.write( - "{},{},{}\n".format( - nu, location.latlng[0], location.latlng[1] - ) - ) - print("Successfully geolocated {}".format(ad)) - except IndexError: - print( - ( - "Search for {} returned {} as coordinates, " "skipping" - ).format(ad, 
location.latlng) - ) - of.write("{},nan,nan\n".format(nu)) diff --git a/pysus/utils/__init__.py b/pysus/utils/__init__.py new file mode 100644 index 00000000..7414d65c --- /dev/null +++ b/pysus/utils/__init__.py @@ -0,0 +1,25 @@ +import datetime +from typing import Union, TypeVar, List, Tuple + +from .brasil import * # noqa + + +T = TypeVar("T") + + +def to_list(item: Union[T, List[T], Tuple[T, ...], None]) -> List[T]: + """Parse any builtin data type into a list""" + if item is None: + return [] + return [item] if not isinstance(item, (list, tuple)) else list(item) + + +def zfill_year(year: Union[str, int]) -> int: + """ + Formats a len(2) year into len(4) with the correct year preffix + E.g: 20 -> 2020; 99 -> 1999 + """ + year = str(year)[-2:].zfill(2) + current_year = str(datetime.datetime.now().year)[-2:] + suffix = "19" if str(year) > current_year else "20" + return int(suffix + str(year)) diff --git a/pysus/utilities/brasil.py b/pysus/utils/brasil.py similarity index 70% rename from pysus/utilities/brasil.py rename to pysus/utils/brasil.py index 0024a7e4..ade8b406 100644 --- a/pysus/utilities/brasil.py +++ b/pysus/utils/brasil.py @@ -1,7 +1,18 @@ +__all__ = [ + "MUNICIPALITIES", + "MUN_BY_GEOCODE", + "UFs", + "MONTHS", + "get_city_name_by_geocode", + "parse_UFs", +] + import json from pathlib import Path from typing import Union +from pysus.utils import to_list + with open( f"{Path(__file__).parent}/municipios.json", "r", encoding="utf-8-sig" ) as muns: @@ -65,3 +76,15 @@ def get_city_name_by_geocode(geocode: Union[str, int]): """ return MUN_BY_GEOCODE[int(geocode)] + + +def parse_UFs(UF: Union[list[str], str]) -> list: + """ + Formats states abbreviations into correct format and retuns a list. + Also checks if there is an incorrect UF in the list. 
+ E.g: ['SC', 'mt', 'ba'] -> ['SC', 'MT', 'BA'] + """ + ufs = [uf.upper() for uf in to_list(UF)] + if not all(uf in list(UFs) for uf in ufs): + raise ValueError(f"Unknown UF(s): {set(ufs).difference(list(UFs))}") + return ufs diff --git a/pysus/preprocessing/decoders.py b/pysus/utils/decoders.py similarity index 94% rename from pysus/preprocessing/decoders.py rename to pysus/utils/decoders.py index 23215a6c..710824ca 100644 --- a/pysus/preprocessing/decoders.py +++ b/pysus/utils/decoders.py @@ -6,6 +6,21 @@ license: GPL V3 or Later """ +__all__ = [ + "decodifica_idade_SINAN", + "get_age_string", + "decodifica_idade_SIM", + "decodifica_data_SIM", + "is_valid_geocode", + "get_valid_geocodes", + "calculate_digit", + "add_dv", + "columns_as_category", + "translate_variables_SIM", + "classify_age", + "get_CID10_code_index", +] + __docformat__ = "restructuredtext en" import re from datetime import datetime, timedelta @@ -182,8 +197,7 @@ def translate_variables_SIM( # SEXO if "SEXO" in variables_names: df["SEXO"] = df.SEXO.replace( - {0: None, 9: None, 1: "Masculino", 2: "Feminino"} - ) + {0: None, 9: None, 1: "Masculino", 2: "Feminino"}) df["SEXO"] = df["SEXO"].astype("category") df["SEXO"] = df["SEXO"].cat.add_categories(["NA"]) df["SEXO"] = df["SEXO"].fillna("NA") @@ -287,9 +301,8 @@ def get_CID10_code_index(datasus_chapters): number_range_start = int(chapter_range[0][1:3]) number_range_finish = int(chapter_range[1][1:3]) for code in range(number_range_start, number_range_finish + 1): - code_index[f"{start_letter}{str(code).zfill(2)}"] = ( - ch_array_index + 1 - ) + code_index[f"{start_letter}{ + str(code).zfill(2)}"] = ch_array_index + 1 else: string_range_start = chapter_range[0][0] string_range_end = chapter_range[1][0] @@ -309,9 +322,7 @@ def get_CID10_code_index(datasus_chapters): else: # Middle letters number_range_start = 0 number_range_end = 99 - for code_number in range( - number_range_start, number_range_end + 1 - ): + for code_number in 
range(number_range_start, number_range_end + 1): code_index[f"{letter}{str(code_number).zfill(2)}"] = ( ch_array_index + 1 ) diff --git a/pysus/dataset/geocode_by_cities.json b/pysus/utils/geocode_by_cities.json similarity index 100% rename from pysus/dataset/geocode_by_cities.json rename to pysus/utils/geocode_by_cities.json diff --git a/pysus/utilities/municipios.json b/pysus/utils/municipios.json similarity index 100% rename from pysus/utilities/municipios.json rename to pysus/utils/municipios.json From cb6a314a87be261c63fcb76acc142662808c40b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=A3=20Bida=20Vacaro?= Date: Tue, 3 Mar 2026 14:05:36 -0300 Subject: [PATCH 3/6] remove circular imports --- pysus/utils/brasil.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pysus/utils/brasil.py b/pysus/utils/brasil.py index ade8b406..c1d59f3b 100644 --- a/pysus/utils/brasil.py +++ b/pysus/utils/brasil.py @@ -11,8 +11,6 @@ from pathlib import Path from typing import Union -from pysus.utils import to_list - with open( f"{Path(__file__).parent}/municipios.json", "r", encoding="utf-8-sig" ) as muns: @@ -84,7 +82,12 @@ def parse_UFs(UF: Union[list[str], str]) -> list: Also checks if there is an incorrect UF in the list. 
E.g: ['SC', 'mt', 'ba'] -> ['SC', 'MT', 'BA'] """ - ufs = [uf.upper() for uf in to_list(UF)] - if not all(uf in list(UFs) for uf in ufs): - raise ValueError(f"Unknown UF(s): {set(ufs).difference(list(UFs))}") + ufs = [uf.upper() for uf in ([UF] if isinstance(UF, str) else UF)] + + valid_ufs = set(UFs) + invalid = set(ufs).difference(valid_ufs) + + if invalid: + raise ValueError(f"Unknown UF(s): {invalid}") + return ufs From 15a90c4fbf689e92338ddb8ab423f8bbafb89b80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=A3=20Bida=20Vacaro?= Date: Sun, 15 Mar 2026 15:40:29 -0300 Subject: [PATCH 4/6] move individual database files to databases.py --- .gitignore | 1 + poetry.lock | 189 +++- pyproject.toml | 2 + pysus/__init__.py | 18 +- .../catalog/models.py => __init__.py} | 0 pysus/api/dadosgov/models.py | 47 +- pysus/api/ducklake/catalog.py | 0 pysus/api/ducklake/client.py | 56 ++ pysus/api/ducklake/models.py | 167 ++++ pysus/api/ducklake/storage.py | 0 pysus/api/ftp/__init__.py | 25 + pysus/api/ftp/client.py | 11 +- pysus/api/ftp/databases.py | 892 ++++++++++++++++++ pysus/api/ftp/databases/__init__.py | 34 - pysus/api/ftp/databases/ciha.py | 103 -- pysus/api/ftp/databases/cnes.py | 135 --- pysus/api/ftp/databases/ibge_datasus.py | 86 -- pysus/api/ftp/databases/pni.py | 95 -- pysus/api/ftp/databases/sia.py | 122 --- pysus/api/ftp/databases/sih.py | 105 --- pysus/api/ftp/databases/sim.py | 69 -- pysus/api/ftp/databases/sinan.py | 144 --- pysus/api/ftp/databases/sinasc.py | 82 -- 23 files changed, 1388 insertions(+), 995 deletions(-) rename pysus/api/{ducklake/catalog/models.py => __init__.py} (100%) create mode 100644 pysus/api/ducklake/catalog.py create mode 100644 pysus/api/ducklake/models.py create mode 100644 pysus/api/ducklake/storage.py create mode 100644 pysus/api/ftp/databases.py delete mode 100644 pysus/api/ftp/databases/__init__.py delete mode 100644 pysus/api/ftp/databases/ciha.py delete mode 100644 pysus/api/ftp/databases/cnes.py delete mode 100644 
pysus/api/ftp/databases/ibge_datasus.py delete mode 100644 pysus/api/ftp/databases/pni.py delete mode 100644 pysus/api/ftp/databases/sia.py delete mode 100644 pysus/api/ftp/databases/sih.py delete mode 100644 pysus/api/ftp/databases/sim.py delete mode 100644 pysus/api/ftp/databases/sinan.py delete mode 100644 pysus/api/ftp/databases/sinasc.py diff --git a/.gitignore b/.gitignore index ebed1ee5..7364e04d 100644 --- a/.gitignore +++ b/.gitignore @@ -179,6 +179,7 @@ dmypy.json # pytype static type analyzer .pytype/ +.pylintrc # Cython debug symbols cython_debug/ diff --git a/poetry.lock b/poetry.lock index 3454fabd..f14f4f85 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1028,6 +1028,23 @@ files = [ [package.extras] all = ["adbc-driver-manager", "fsspec", "ipython", "numpy", "pandas", "pyarrow"] +[[package]] +name = "duckdb-engine" +version = "0.17.0" +description = "SQLAlchemy driver for duckdb" +optional = false +python-versions = "<4,>=3.9" +groups = ["main"] +files = [ + {file = "duckdb_engine-0.17.0-py3-none-any.whl", hash = "sha256:3aa72085e536b43faab635f487baf77ddc5750069c16a2f8d9c6c3cb6083e979"}, + {file = "duckdb_engine-0.17.0.tar.gz", hash = "sha256:396b23869754e536aa80881a92622b8b488015cf711c5a40032d05d2cf08f3cf"}, +] + +[package.dependencies] +duckdb = ">=0.5.0" +packaging = ">=21" +sqlalchemy = ">=1.3.22" + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -1359,6 +1376,74 @@ ratelim = "*" requests = "*" six = "*" +[[package]] +name = "greenlet" +version = "3.3.2" +description = "Lightweight in-process concurrent programming" +optional = false +python-versions = ">=3.10" +groups = ["main"] +markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\"" +files = [ + {file = "greenlet-3.3.2-cp310-cp310-macosx_11_0_universal2.whl", hash = 
"sha256:9bc885b89709d901859cf95179ec9f6bb67a3d2bb1f0e88456461bd4b7f8fd0d"}, + {file = "greenlet-3.3.2-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b568183cf65b94919be4438dc28416b234b678c608cafac8874dfeeb2a9bbe13"}, + {file = "greenlet-3.3.2-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:527fec58dc9f90efd594b9b700662ed3fb2493c2122067ac9c740d98080a620e"}, + {file = "greenlet-3.3.2-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:508c7f01f1791fbc8e011bd508f6794cb95397fdb198a46cb6635eb5b78d85a7"}, + {file = "greenlet-3.3.2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ad0c8917dd42a819fe77e6bdfcb84e3379c0de956469301d9fd36427a1ca501f"}, + {file = "greenlet-3.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:97245cc10e5515dbc8c3104b2928f7f02b6813002770cfaffaf9a6e0fc2b94ef"}, + {file = "greenlet-3.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8c1fdd7d1b309ff0da81d60a9688a8bd044ac4e18b250320a96fc68d31c209ca"}, + {file = "greenlet-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:5d0e35379f93a6d0222de929a25ab47b5eb35b5ef4721c2b9cbcc4036129ff1f"}, + {file = "greenlet-3.3.2-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:c56692189a7d1c7606cb794be0a8381470d95c57ce5be03fb3d0ef57c7853b86"}, + {file = "greenlet-3.3.2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ebd458fa8285960f382841da585e02201b53a5ec2bac6b156fc623b5ce4499f"}, + {file = "greenlet-3.3.2-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a443358b33c4ec7b05b79a7c8b466f5d275025e750298be7340f8fc63dff2a55"}, + {file = "greenlet-3.3.2-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4375a58e49522698d3e70cc0b801c19433021b5c37686f7ce9c65b0d5c8677d2"}, + {file = "greenlet-3.3.2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:8e2cd90d413acbf5e77ae41e5d3c9b3ac1d011a756d7284d7f3f2b806bbd6358"}, + {file = "greenlet-3.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:442b6057453c8cb29b4fb36a2ac689382fc71112273726e2423f7f17dc73bf99"}, + {file = "greenlet-3.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:45abe8eb6339518180d5a7fa47fa01945414d7cca5ecb745346fc6a87d2750be"}, + {file = "greenlet-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:1e692b2dae4cc7077cbb11b47d258533b48c8fde69a33d0d8a82e2fe8d8531d5"}, + {file = "greenlet-3.3.2-cp311-cp311-win_arm64.whl", hash = "sha256:02b0a8682aecd4d3c6c18edf52bc8e51eacdd75c8eac52a790a210b06aa295fd"}, + {file = "greenlet-3.3.2-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:ac8d61d4343b799d1e526db579833d72f23759c71e07181c2d2944e429eb09cd"}, + {file = "greenlet-3.3.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ceec72030dae6ac0c8ed7591b96b70410a8be370b6a477b1dbc072856ad02bd"}, + {file = "greenlet-3.3.2-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a2a5be83a45ce6188c045bcc44b0ee037d6a518978de9a5d97438548b953a1ac"}, + {file = "greenlet-3.3.2-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ae9e21c84035c490506c17002f5c8ab25f980205c3e61ddb3a2a2a2e6c411fcb"}, + {file = "greenlet-3.3.2-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43e99d1749147ac21dde49b99c9abffcbc1e2d55c67501465ef0930d6e78e070"}, + {file = "greenlet-3.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4c956a19350e2c37f2c48b336a3afb4bff120b36076d9d7fb68cb44e05d95b79"}, + {file = "greenlet-3.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6c6f8ba97d17a1e7d664151284cb3315fc5f8353e75221ed4324f84eb162b395"}, + {file = "greenlet-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:34308836d8370bddadb41f5a7ce96879b72e2fdfb4e87729330c6ab52376409f"}, + {file = "greenlet-3.3.2-cp312-cp312-win_arm64.whl", hash = 
"sha256:d3a62fa76a32b462a97198e4c9e99afb9ab375115e74e9a83ce180e7a496f643"}, + {file = "greenlet-3.3.2-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:aa6ac98bdfd716a749b84d4034486863fd81c3abde9aa3cf8eff9127981a4ae4"}, + {file = "greenlet-3.3.2-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ab0c7e7901a00bc0a7284907273dc165b32e0d109a6713babd04471327ff7986"}, + {file = "greenlet-3.3.2-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d248d8c23c67d2291ffd47af766e2a3aa9fa1c6703155c099feb11f526c63a92"}, + {file = "greenlet-3.3.2-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ccd21bb86944ca9be6d967cf7691e658e43417782bce90b5d2faeda0ff78a7dd"}, + {file = "greenlet-3.3.2-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b6997d360a4e6a4e936c0f9625b1c20416b8a0ea18a8e19cabbefc712e7397ab"}, + {file = "greenlet-3.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:64970c33a50551c7c50491671265d8954046cb6e8e2999aacdd60e439b70418a"}, + {file = "greenlet-3.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1a9172f5bf6bd88e6ba5a84e0a68afeac9dc7b6b412b245dd64f52d83c81e55b"}, + {file = "greenlet-3.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:a7945dd0eab63ded0a48e4dcade82939783c172290a7903ebde9e184333ca124"}, + {file = "greenlet-3.3.2-cp313-cp313-win_arm64.whl", hash = "sha256:394ead29063ee3515b4e775216cb756b2e3b4a7e55ae8fd884f17fa579e6b327"}, + {file = "greenlet-3.3.2-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:8d1658d7291f9859beed69a776c10822a0a799bc4bfe1bd4272bb60e62507dab"}, + {file = "greenlet-3.3.2-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:18cb1b7337bca281915b3c5d5ae19f4e76d35e1df80f4ad3c1a7be91fadf1082"}, + {file = "greenlet-3.3.2-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c2e47408e8ce1c6f1ceea0dffcdf6ebb85cc09e55c7af407c99f1112016e45e9"}, + {file = 
"greenlet-3.3.2-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e3cb43ce200f59483eb82949bf1835a99cf43d7571e900d7c8d5c62cdf25d2f9"}, + {file = "greenlet-3.3.2-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63d10328839d1973e5ba35e98cccbca71b232b14051fd957b6f8b6e8e80d0506"}, + {file = "greenlet-3.3.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8e4ab3cfb02993c8cc248ea73d7dae6cec0253e9afa311c9b37e603ca9fad2ce"}, + {file = "greenlet-3.3.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:94ad81f0fd3c0c0681a018a976e5c2bd2ca2d9d94895f23e7bb1af4e8af4e2d5"}, + {file = "greenlet-3.3.2-cp314-cp314-win_amd64.whl", hash = "sha256:8c4dd0f3997cf2512f7601563cc90dfb8957c0cff1e3a1b23991d4ea1776c492"}, + {file = "greenlet-3.3.2-cp314-cp314-win_arm64.whl", hash = "sha256:cd6f9e2bbd46321ba3bbb4c8a15794d32960e3b0ae2cc4d49a1a53d314805d71"}, + {file = "greenlet-3.3.2-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:e26e72bec7ab387ac80caa7496e0f908ff954f31065b0ffc1f8ecb1338b11b54"}, + {file = "greenlet-3.3.2-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b466dff7a4ffda6ca975979bab80bdadde979e29fc947ac3be4451428d8b0e4"}, + {file = "greenlet-3.3.2-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b8bddc5b73c9720bea487b3bffdb1840fe4e3656fba3bd40aa1489e9f37877ff"}, + {file = "greenlet-3.3.2-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:59b3e2c40f6706b05a9cd299c836c6aa2378cabe25d021acd80f13abf81181cf"}, + {file = "greenlet-3.3.2-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b26b0f4428b871a751968285a1ac9648944cea09807177ac639b030bddebcea4"}, + {file = "greenlet-3.3.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1fb39a11ee2e4d94be9a76671482be9398560955c9e568550de0224e41104727"}, + {file = "greenlet-3.3.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = 
"sha256:20154044d9085151bc309e7689d6f7ba10027f8f5a8c0676ad398b951913d89e"}, + {file = "greenlet-3.3.2-cp314-cp314t-win_amd64.whl", hash = "sha256:c04c5e06ec3e022cbfe2cd4a846e1d4e50087444f875ff6d2c2ad8445495cf1a"}, + {file = "greenlet-3.3.2.tar.gz", hash = "sha256:2eaf067fc6d886931c7962e8c6bede15d2f01965560f3359b27c80bde2d151f2"}, +] + +[package.extras] +docs = ["Sphinx", "furo"] +test = ["objgraph", "psutil", "setuptools"] + [[package]] name = "h11" version = "0.14.0" @@ -4156,6 +4241,108 @@ lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] standalone = ["Sphinx (>=5)"] test = ["pytest"] +[[package]] +name = "sqlalchemy" +version = "2.0.48" +description = "Database Abstraction Library" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "sqlalchemy-2.0.48-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7001dc9d5f6bb4deb756d5928eaefe1930f6f4179da3924cbd95ee0e9f4dce89"}, + {file = "sqlalchemy-2.0.48-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1a89ce07ad2d4b8cfc30bd5889ec40613e028ed80ef47da7d9dd2ce969ad30e0"}, + {file = "sqlalchemy-2.0.48-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10853a53a4a00417a00913d270dddda75815fcb80675874285f41051c094d7dd"}, + {file = "sqlalchemy-2.0.48-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:fac0fa4e4f55f118fd87177dacb1c6522fe39c28d498d259014020fec9164c29"}, + {file = "sqlalchemy-2.0.48-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3713e21ea67bca727eecd4a24bf68bcd414c403faae4989442be60994301ded0"}, + {file = "sqlalchemy-2.0.48-cp310-cp310-win32.whl", hash = "sha256:d404dc897ce10e565d647795861762aa2d06ca3f4a728c5e9a835096c7059018"}, + {file = "sqlalchemy-2.0.48-cp310-cp310-win_amd64.whl", hash = "sha256:841a94c66577661c1f088ac958cd767d7c9bf507698f45afffe7a4017049de76"}, + {file = "sqlalchemy-2.0.48-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:1b4c575df7368b3b13e0cebf01d4679f9a28ed2ae6c1cd0b1d5beffb6b2007dc"}, + {file = "sqlalchemy-2.0.48-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e83e3f959aaa1c9df95c22c528096d94848a1bc819f5d0ebf7ee3df0ca63db6c"}, + {file = "sqlalchemy-2.0.48-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6f7b7243850edd0b8b97043f04748f31de50cf426e939def5c16bedb540698f7"}, + {file = "sqlalchemy-2.0.48-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:82745b03b4043e04600a6b665cb98697c4339b24e34d74b0a2ac0a2488b6f94d"}, + {file = "sqlalchemy-2.0.48-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e5e088bf43f6ee6fec7dbf1ef7ff7774a616c236b5c0cb3e00662dd71a56b571"}, + {file = "sqlalchemy-2.0.48-cp311-cp311-win32.whl", hash = "sha256:9c7d0a77e36b5f4b01ca398482230ab792061d243d715299b44a0b55c89fe617"}, + {file = "sqlalchemy-2.0.48-cp311-cp311-win_amd64.whl", hash = "sha256:583849c743e0e3c9bb7446f5b5addeacedc168d657a69b418063dfdb2d90081c"}, + {file = "sqlalchemy-2.0.48-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:348174f228b99f33ca1f773e85510e08927620caa59ffe7803b37170df30332b"}, + {file = "sqlalchemy-2.0.48-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53667b5f668991e279d21f94ccfa6e45b4e3f4500e7591ae59a8012d0f010dcb"}, + {file = "sqlalchemy-2.0.48-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34634e196f620c7a61d18d5cf7dc841ca6daa7961aed75d532b7e58b309ac894"}, + {file = "sqlalchemy-2.0.48-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:546572a1793cc35857a2ffa1fe0e58571af1779bcc1ffa7c9fb0839885ed69a9"}, + {file = "sqlalchemy-2.0.48-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:07edba08061bc277bfdc772dd2a1a43978f5a45994dd3ede26391b405c15221e"}, + {file = "sqlalchemy-2.0.48-cp312-cp312-win32.whl", hash = 
"sha256:908a3fa6908716f803b86896a09a2c4dde5f5ce2bb07aacc71ffebb57986ce99"}, + {file = "sqlalchemy-2.0.48-cp312-cp312-win_amd64.whl", hash = "sha256:68549c403f79a8e25984376480959975212a670405e3913830614432b5daa07a"}, + {file = "sqlalchemy-2.0.48-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e3070c03701037aa418b55d36532ecb8f8446ed0135acb71c678dbdf12f5b6e4"}, + {file = "sqlalchemy-2.0.48-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2645b7d8a738763b664a12a1542c89c940daa55196e8d73e55b169cc5c99f65f"}, + {file = "sqlalchemy-2.0.48-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b19151e76620a412c2ac1c6f977ab1b9fa7ad43140178345136456d5265b32ed"}, + {file = "sqlalchemy-2.0.48-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5b193a7e29fd9fa56e502920dca47dffe60f97c863494946bd698c6058a55658"}, + {file = "sqlalchemy-2.0.48-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:36ac4ddc3d33e852da9cb00ffb08cea62ca05c39711dc67062ca2bb1fae35fd8"}, + {file = "sqlalchemy-2.0.48-cp313-cp313-win32.whl", hash = "sha256:389b984139278f97757ea9b08993e7b9d1142912e046ab7d82b3fbaeb0209131"}, + {file = "sqlalchemy-2.0.48-cp313-cp313-win_amd64.whl", hash = "sha256:d612c976cbc2d17edfcc4c006874b764e85e990c29ce9bd411f926bbfb02b9a2"}, + {file = "sqlalchemy-2.0.48-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:69f5bc24904d3bc3640961cddd2523e361257ef68585d6e364166dfbe8c78fae"}, + {file = "sqlalchemy-2.0.48-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd08b90d211c086181caed76931ecfa2bdfc83eea3cfccdb0f82abc6c4b876cb"}, + {file = "sqlalchemy-2.0.48-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:1ccd42229aaac2df431562117ac7e667d702e8e44afdb6cf0e50fa3f18160f0b"}, + {file = "sqlalchemy-2.0.48-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = 
"sha256:f0dcbc588cd5b725162c076eb9119342f6579c7f7f55057bb7e3c6ff27e13121"}, + {file = "sqlalchemy-2.0.48-cp313-cp313t-win32.whl", hash = "sha256:9764014ef5e58aab76220c5664abb5d47d5bc858d9debf821e55cfdd0f128485"}, + {file = "sqlalchemy-2.0.48-cp313-cp313t-win_amd64.whl", hash = "sha256:e2f35b4cccd9ed286ad62e0a3c3ac21e06c02abc60e20aa51a3e305a30f5fa79"}, + {file = "sqlalchemy-2.0.48-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:e2d0d88686e3d35a76f3e15a34e8c12d73fc94c1dea1cd55782e695cc14086dd"}, + {file = "sqlalchemy-2.0.48-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49b7bddc1eebf011ea5ab722fdbe67a401caa34a350d278cc7733c0e88fecb1f"}, + {file = "sqlalchemy-2.0.48-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:426c5ca86415d9b8945c7073597e10de9644802e2ff502b8e1f11a7a2642856b"}, + {file = "sqlalchemy-2.0.48-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:288937433bd44e3990e7da2402fabc44a3c6c25d3704da066b85b89a85474ae0"}, + {file = "sqlalchemy-2.0.48-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8183dc57ae7d9edc1346e007e840a9f3d6aa7b7f165203a99e16f447150140d2"}, + {file = "sqlalchemy-2.0.48-cp314-cp314-win32.whl", hash = "sha256:1182437cb2d97988cfea04cf6cdc0b0bb9c74f4d56ec3d08b81e23d621a28cc6"}, + {file = "sqlalchemy-2.0.48-cp314-cp314-win_amd64.whl", hash = "sha256:144921da96c08feb9e2b052c5c5c1d0d151a292c6135623c6b2c041f2a45f9e0"}, + {file = "sqlalchemy-2.0.48-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5aee45fd2c6c0f2b9cdddf48c48535e7471e42d6fb81adfde801da0bd5b93241"}, + {file = "sqlalchemy-2.0.48-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7cddca31edf8b0653090cbb54562ca027c421c58ddde2c0685f49ff56a1690e0"}, + {file = "sqlalchemy-2.0.48-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = 
"sha256:7a936f1bb23d370b7c8cc079d5fce4c7d18da87a33c6744e51a93b0f9e97e9b3"}, + {file = "sqlalchemy-2.0.48-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e004aa9248e8cb0a5f9b96d003ca7c1c0a5da8decd1066e7b53f59eb8ce7c62b"}, + {file = "sqlalchemy-2.0.48-cp314-cp314t-win32.whl", hash = "sha256:b8438ec5594980d405251451c5b7ea9aa58dda38eb7ac35fb7e4c696712ee24f"}, + {file = "sqlalchemy-2.0.48-cp314-cp314t-win_amd64.whl", hash = "sha256:d854b3970067297f3a7fbd7a4683587134aa9b3877ee15aa29eea478dc68f933"}, + {file = "sqlalchemy-2.0.48-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f8649a14caa5f8a243628b1d61cf530ad9ae4578814ba726816adb1121fc493e"}, + {file = "sqlalchemy-2.0.48-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6bb85c546591569558571aa1b06aba711b26ae62f111e15e56136d69920e1616"}, + {file = "sqlalchemy-2.0.48-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6b764fb312bd35e47797ad2e63f0d323792837a6ac785a4ca967019357d2bc7"}, + {file = "sqlalchemy-2.0.48-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:7c998f2ace8bf76b453b75dbcca500d4f4b9dd3908c13e89b86289b37784848b"}, + {file = "sqlalchemy-2.0.48-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:d64177f443594c8697369c10e4bbcac70ef558e0f7921a1de7e4a3d1734bcf67"}, + {file = "sqlalchemy-2.0.48-cp38-cp38-win32.whl", hash = "sha256:01f6bbd4308b23240cf7d3ef117557c8fd097ec9549d5d8a52977544e35b40ad"}, + {file = "sqlalchemy-2.0.48-cp38-cp38-win_amd64.whl", hash = "sha256:858e433f12b0e5b3ed2f8da917433b634f4937d0e8793e5cb33c54a1a01df565"}, + {file = "sqlalchemy-2.0.48-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4599a95f9430ae0de82b52ff0d27304fe898c17cb5f4099f7438a51b9998ac77"}, + {file = "sqlalchemy-2.0.48-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f27f9da0a7d22b9f981108fd4b62f8b5743423388915a563e651c20d06c1f457"}, + {file = 
"sqlalchemy-2.0.48-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d8fcccbbc0c13c13702c471da398b8cd72ba740dca5859f148ae8e0e8e0d3e7e"}, + {file = "sqlalchemy-2.0.48-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a5b429eb84339f9f05e06083f119ad814e6d85e27ecbdf9c551dfdbb128eaf8a"}, + {file = "sqlalchemy-2.0.48-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:bcb8ebbf2e2c36cfe01a94f2438012c6a9d494cf80f129d9753bcdf33bfc35a6"}, + {file = "sqlalchemy-2.0.48-cp39-cp39-win32.whl", hash = "sha256:e214d546c8ecb5fc22d6e6011746082abf13a9cf46eefb45769c7b31407c97b5"}, + {file = "sqlalchemy-2.0.48-cp39-cp39-win_amd64.whl", hash = "sha256:b8fc3454b4f3bd0a368001d0e968852dad45a873f8b4babd41bc302ec851a099"}, + {file = "sqlalchemy-2.0.48-py3-none-any.whl", hash = "sha256:a66fe406437dd65cacd96a72689a3aaaecaebbcd62d81c5ac1c0fdbeac835096"}, + {file = "sqlalchemy-2.0.48.tar.gz", hash = "sha256:5ca74f37f3369b45e1f6b7b06afb182af1fd5dde009e4ffd831830d98cbe5fe7"}, +] + +[package.dependencies] +greenlet = {version = ">=1", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""} +typing-extensions = ">=4.6.0" + +[package.extras] +aiomysql = ["aiomysql (>=0.2.0)", "greenlet (>=1)"] +aioodbc = ["aioodbc", "greenlet (>=1)"] +aiosqlite = ["aiosqlite", "greenlet (>=1)", "typing_extensions (!=3.10.0.1)"] +asyncio = ["greenlet (>=1)"] +asyncmy = ["asyncmy (>=0.2.3,!=0.2.4,!=0.2.6)", "greenlet (>=1)"] +mariadb-connector = ["mariadb (>=1.0.1,!=1.1.2,!=1.1.5,!=1.1.10)"] +mssql = ["pyodbc"] +mssql-pymssql = ["pymssql"] +mssql-pyodbc = ["pyodbc"] +mypy = ["mypy (>=0.910)"] +mysql = ["mysqlclient (>=1.4.0)"] +mysql-connector = ["mysql-connector-python"] +oracle = ["cx_oracle (>=8)"] +oracle-oracledb = ["oracledb (>=1.0.1)"] +postgresql = ["psycopg2 (>=2.7)"] 
+postgresql-asyncpg = ["asyncpg", "greenlet (>=1)"] +postgresql-pg8000 = ["pg8000 (>=1.29.1)"] +postgresql-psycopg = ["psycopg (>=3.0.7)"] +postgresql-psycopg2binary = ["psycopg2-binary"] +postgresql-psycopg2cffi = ["psycopg2cffi"] +postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"] +pymysql = ["pymysql"] +sqlcipher = ["sqlcipher3_binary"] + [[package]] name = "stack-data" version = "0.6.3" @@ -4595,4 +4782,4 @@ preprocessing = [] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "4b551ecb1dddda94c2ea6579463188e6aa0ab5da486b63dd8bed941ce9a4d7db" +content-hash = "47cffe061807056ea49f027be88f4c848bd92c22bd4f45054d5b0b3896ae2e87" diff --git a/pyproject.toml b/pyproject.toml index 781738c0..f9504e76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,8 @@ humanize = "^4.8.0" typing-extensions = ">=4.10.0" pydantic = "^2.12.5" duckdb = "^1.4.4" +duckdb-engine = "^0.17.0" +sqlalchemy = "^2.0.48" [tool.poetry.group.dev.dependencies] pytest = ">=6.1.0" diff --git a/pysus/__init__.py b/pysus/__init__.py index 1cfef31f..1d64ab45 100644 --- a/pysus/__init__.py +++ b/pysus/__init__.py @@ -1,8 +1,16 @@ -# type: ignore[attr-defined] """PySUS Python package""" +import os +import pathlib +from typing import Final from importlib import metadata as importlib_metadata + +CACHEPATH: Final[str] = os.getenv( + "PYSUS_CACHEPATH", + os.path.join(str(pathlib.Path.home()), "pysus"), +) + from pysus.api.ftp.databases import * # noqa @@ -10,14 +18,8 @@ def get_version() -> str: try: return importlib_metadata.version(__name__) except importlib_metadata.PackageNotFoundError: # pragma: no cover - return "1.0.1" # changed by semantic-release + return "1.0.1" version: str = get_version() __version__: str = version - -__all__ = [ - "AVAILABLE_DATABASES", - "version", - "__version__", -] diff --git a/pysus/api/ducklake/catalog/models.py b/pysus/api/__init__.py similarity index 100% rename from pysus/api/ducklake/catalog/models.py rename to 
pysus/api/__init__.py diff --git a/pysus/api/dadosgov/models.py b/pysus/api/dadosgov/models.py index 149cb0fb..407e3560 100644 --- a/pysus/api/dadosgov/models.py +++ b/pysus/api/dadosgov/models.py @@ -1,6 +1,10 @@ -from pydantic import BaseModel, Field, BeforeValidator +import requests +from pathlib import Path from datetime import datetime as dt -from typing import Optional, List, Any, Annotated +from typing import Optional, List, Any, Annotated, Union +from pydantic import BaseModel, Field, BeforeValidator + +from pysus import CACHEPATH def to_datetime(value: Any) -> Optional[dt]: @@ -30,6 +34,9 @@ class Tag(BaseModel): name: str display_name: Optional[str] = None + def __str__(self): + return self.name + class Resource(BaseModel): id: str @@ -38,14 +45,37 @@ class Resource(BaseModel): url: str = Field(alias="link") format: str = Field(alias="formato") size: int = Field(alias="tamanho") - cataloging_date: DateTime = Field(None, alias="dataCatalogacao") - last_modified: DateTime = Field(None, alias="dataUltimaAtualizacaoArquivo") + cataloging_date: Optional[str] = Field(None, alias="dataCatalogacao") + last_modified: Optional[str] = Field( + None, + alias="dataUltimaAtualizacaoArquivo", + ) download_count: Optional[int] = Field(None, alias="quantidadeDownloads") file_name: Optional[str] = Field(None, alias="nomeArquivo") resource_type: Optional[str] = Field(None, alias="tipo") order_number: Optional[int] = Field(None, alias="numOrdem") dataset_id: Optional[str] = Field(None, alias="idConjuntoDados") + def __str__(self): + return self.file_name + + def download(self, target_dir: Union[str, Path] = CACHEPATH) -> Path: + target_path = Path(target_dir) + target_path.mkdir(parents=True, exist_ok=True) + + output_file = target_path / ( + self.file_name or f"{self.id}.{self.format.lower()}" + ) + + response = requests.get(self.url, stream=True) + response.raise_for_status() + + with open(output_file, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + 
f.write(chunk) + + return output_file + class DatasetDetail(BaseModel): id: str @@ -63,7 +93,8 @@ class DatasetDetail(BaseModel): is_open_data: Bool = Field(alias="dadosAbertos") is_discontinued: Bool = Field(alias="descontinuado") is_private: Bool = Field(False, alias="privado") - metadata_updated: DateTime = Field(None, alias="dataUltimaAtualizacaoMetadados") + metadata_updated: DateTime = Field( + None, alias="dataUltimaAtualizacaoMetadados") file_updated: DateTime = Field(None, alias="dataUltimaAtualizacaoArquivo") cataloging_date: DateTime = Field(None, alias="dataCatalogacao") visibility: str = Field(alias="visibilidade") @@ -71,6 +102,9 @@ class DatasetDetail(BaseModel): seal: Optional[str] = Field(None, alias="selo") source: Optional[str] = Field(None, alias="origemCadastro") + def __str__(self): + return self.id + class DatasetSummary(BaseModel): id: str @@ -81,3 +115,6 @@ class DatasetSummary(BaseModel): cataloging_date: DateTime = Field(None, alias="catalogacao") metadata_modified: DateTime = Field(None, alias="ultimaAlteracaoMetadados") last_update: DateTime = Field(None, alias="ultimaAtualizacaoDados") + + def __str__(self): + return self.name diff --git a/pysus/api/ducklake/catalog.py b/pysus/api/ducklake/catalog.py new file mode 100644 index 00000000..e69de29b diff --git a/pysus/api/ducklake/client.py b/pysus/api/ducklake/client.py index e69de29b..6818ce3b 100644 --- a/pysus/api/ducklake/client.py +++ b/pysus/api/ducklake/client.py @@ -0,0 +1,56 @@ +import requests +from pathlib import Path + +import duckdb + +from pysus import CACHEPATH + + +class DuckLake: + def __init__(self): + self.endpoint = "nbg1.your-objectstorage.com" + self.remote_url = f"https://{self.endpoint}/pysus/public/catalog.db" + self.cache_dir = Path(CACHEPATH) / "ducklake" + self.cache_dir.mkdir(parents=True, exist_ok=True) + self.catalog_local = self.cache_dir / "catalog.db" + self._ensure_catalog() + self.con = self._connect() + + def _remote_size(self): + r = 
requests.head(self.remote_url) + r.raise_for_status() + return int(r.headers.get("content-length", 0)) + + def _local_size(self): + if not self.catalog_local.exists(): + return None + return self.catalog_local.stat().st_size + + def _download_catalog(self): + r = requests.get(self.remote_url, stream=True) + r.raise_for_status() + with open(self.catalog_local, "wb") as f: + for chunk in r.iter_content(chunk_size=1024 * 1024): + f.write(chunk) + + def _ensure_catalog(self): + if self._remote_size() != self._local_size(): + self._download_catalog() + + def _connect(self): + con = duckdb.connect() + con.execute( + f""" + SET s3_endpoint='{self.endpoint}'; + SET s3_region='nbg1'; + SET s3_url_style='path'; + SET s3_use_ssl=true; + """ + ) + con.execute( + f""" + ATTACH 'ducklake:{self.catalog_local}' AS pysus; + USE pysus; + """ + ) + return con diff --git a/pysus/api/ducklake/models.py b/pysus/api/ducklake/models.py new file mode 100644 index 00000000..54d6850c --- /dev/null +++ b/pysus/api/ducklake/models.py @@ -0,0 +1,167 @@ +import enum + +from sqlalchemy.orm import declarative_base, relationship +from sqlalchemy import ( + Column, + Integer, + String, + ForeignKey, + Date, + Boolean, + Index, + Enum, +) + +Base = declarative_base() + + +class Catalog(Base): + __abstract__ = True + __table_args__ = {"schema": "pysus"} + + +class Dataset(Catalog): + __tablename__ = "datasets" + + id = Column(Integer, primary_key=True) + name = Column(String, nullable=False, unique=True, index=True) + metadata_id = Column( + Integer, + ForeignKey("pysus.dataset_metadata.id"), + index=True, + ) + + dataset_metadata = relationship( + "DatasetMetadata", + back_populates="datasets", + ) + + groups = relationship( + "DatasetGroup", + back_populates="dataset", + cascade="all, delete-orphan", + ) + + columns = relationship( + "ColumnDefinition", + back_populates="dataset", + cascade="all, delete-orphan", + ) + + +class ColumnDefinition(Catalog): + __tablename__ = "dataset_columns" + + id = 
Column(Integer, primary_key=True) + dataset_id = Column( + Integer, + ForeignKey("pysus.datasets.id"), + nullable=False, + index=True, + ) + name = Column(String, nullable=False) + type = Column(String, nullable=False) + description = Column(String, nullable=True) + nullable = Column(Boolean, nullable=False, default=True) + position = Column(Integer, nullable=False, index=True) + + dataset = relationship("Dataset", back_populates="columns") + + __table_args__ = ( + Index("ix_columns_dataset_name", "dataset_id", "name"), + {"schema": "pysus"}, + ) + + +class DatasetGroup(Catalog): + __tablename__ = "dataset_groups" + + id = Column(Integer, primary_key=True) + name = Column(String, nullable=False) + dataset_id = Column( + Integer, + ForeignKey("pysus.datasets.id"), + nullable=False, + index=True, + ) + metadata_id = Column( + Integer, + ForeignKey("pysus.dataset_group_metadata.id"), + index=True, + ) + + dataset = relationship( + "Dataset", + back_populates="groups", + ) + + group_metadata = relationship( + "DatasetGroupMetadata", + back_populates="groups", + ) + + files = relationship( + "File", + back_populates="group", + cascade="all, delete-orphan", + ) + + __table_args__ = ( + Index("ix_groups_dataset_name", "dataset_id", "name"), + {"schema": "pysus"}, + ) + + +class File(Catalog): + __tablename__ = "files" + + id = Column(Integer, primary_key=True) + + group_id = Column( + Integer, + ForeignKey("pysus.dataset_groups.id"), + nullable=False, + index=True, + ) + path = Column(String, nullable=False, unique=True) + size = Column(Integer, nullable=False) + rows = Column(Integer, nullable=False) + + modified = Column(Date, nullable=False) + + group = relationship( + "DatasetGroup", + back_populates="files", + ) + + +class DatasetMetadata(Catalog): + class Origin(enum.Enum): + FTP = "ftp" + API = "api" + + __tablename__ = "dataset_metadata" + + id = Column(Integer, primary_key=True) + long_name = Column(String, nullable=False) + description = Column(String, 
nullable=True) + source = Column(String, nullable=True) + origin = Column(Enum(Origin), nullable=False) + + datasets = relationship( + "Dataset", + back_populates="dataset_metadata", + ) + + +class DatasetGroupMetadata(Catalog): + __tablename__ = "dataset_group_metadata" + + id = Column(Integer, primary_key=True) + long_name = Column(String, nullable=False) + description = Column(String, nullable=True) + + groups = relationship( + "DatasetGroup", + back_populates="group_metadata", + ) diff --git a/pysus/api/ducklake/storage.py b/pysus/api/ducklake/storage.py new file mode 100644 index 00000000..e69de29b diff --git a/pysus/api/ftp/__init__.py b/pysus/api/ftp/__init__.py index 65944e50..852efe38 100644 --- a/pysus/api/ftp/__init__.py +++ b/pysus/api/ftp/__init__.py @@ -1,3 +1,28 @@ from .client import * # noqa from .databases import * # noqa + +AVAILABLE_DATABASES = [ + CIHA, + CNES, + IBGEDATASUS, + PNI, + SIA, + SIH, + SIM, + SINAN, + SINASC, +] + +__all__ = [ + "CIHA", + "CNES", + "IBGEDATASUS", + "PNI", + "SIA", + "SIH", + "SIM", + "SINAN", + "SINASC", + "AVAILABLE_DATABASES", +] diff --git a/pysus/api/ftp/client.py b/pysus/api/ftp/client.py index 453ed3f4..88c46e1b 100644 --- a/pysus/api/ftp/client.py +++ b/pysus/api/ftp/client.py @@ -1,6 +1,6 @@ from __future__ import annotations -__all__ = ["File", "Directory", "Database", "CACHEPATH"] +__all__ = ["File", "Directory", "Database"] import asyncio import os @@ -23,19 +23,18 @@ import humanize from aioftp import Client from loguru import logger -from pysus.data.local import Data -from pysus.utils import to_list from tqdm import tqdm from typing_extensions import Self +from pysus import CACHEPATH +from pysus.data.local import Data +from pysus.utils import to_list + # Type aliases PathLike = Union[str, pathlib.Path] FileContent = Dict[str, Union["Directory", "File"]] # Constants -CACHEPATH: Final[str] = os.getenv( - "PYSUS_CACHEPATH", os.path.join(str(pathlib.Path.home()), "pysus") -) __cachepath__: 
Final[pathlib.Path] = pathlib.Path(CACHEPATH) __cachepath__.mkdir(exist_ok=True) diff --git a/pysus/api/ftp/databases.py b/pysus/api/ftp/databases.py new file mode 100644 index 00000000..c2dcf47c --- /dev/null +++ b/pysus/api/ftp/databases.py @@ -0,0 +1,892 @@ +__all__ = [ + "CIHA", + "CNES", + "IBGEDATASUS", + "PNI", + "SIA", + "SIH", + "SIM", + "SINAN", + "SINASC", +] + +from typing import List, Optional, Union, Literal + +from pysus.api.ftp import Database, Directory, File +from pysus.utils import UFs, parse_UFs, to_list, zfill_year, MONTHS + + +class CIHA(Database): + name = "CIHA" + paths = (Directory("/dissemin/publicos/CIHA/201101_/Dados"),) + metadata = { + "long_name": "Comunicação de Internação Hospitalar e Ambulatorial", + "source": "http://ciha.datasus.gov.br/CIHA/index.php", + "description": ( + "A CIHA foi criada para ampliar o processo de planejamento, " + "programação, controle, avaliação e regulação da assistência à " + "saúde permitindo um conhecimento mais abrangente e profundo dos " + "perfis nosológico e epidemiológico da população brasileira, da " + "capacidade instalada e do potencial de produção de serviços do " + "conjunto de estabelecimentos de saúde do País. O sistema permite " + "o acompanhamento das ações e serviços de saúde custeados " + "por: planos privados de assistência à saúde; planos públicos; " + "pagamento particular por pessoa física; pagamento particular por " + "pessoa jurídica; programas e projetos federais (PRONON, PRONAS, " + "PROADI); recursos próprios das secretarias municipais e estaduais" + " de saúde; DPVAT; gratuidade e, a partir da publicação da " + "Portaria GM/MS nº 2.905/2022, consórcios públicos. 
As " + "informações registradas na CIHA servem como base para o processo " + "de Certificação de Entidades Beneficentes de Assistência Social " + "em Saúde (CEBAS) e para monitoramento dos programas PRONAS e " + "PRONON" + ), + } + groups = { + "CIHA": "Comunicação de Internação Hospitalar e Ambulatorial", + } + + def describe(self, file: File): + if not isinstance(file, File): + return file + + if file.extension.upper() in [".DBC", ".DBF"]: + group, _uf, year, month = self.format(file) + + try: + uf = UFs[_uf] + except KeyError: + uf = _uf + + description = { + "name": str(file.basename), + "group": self.groups[group], + "uf": uf, + "month": MONTHS[int(month)], + "year": zfill_year(year), + "size": file.info["size"], + "last_update": file.info["modify"], + } + + return description + return file + + def format(self, file: File) -> tuple: + group, _uf = file.name[:4].upper(), file.name[4:6].upper() + year, month = file.name[-4:-2], file.name[-2:] + return group, _uf, zfill_year(year), month + + def get_files( + self, + uf: Optional[Union[List[str], str]] = None, + year: Optional[Union[list, str, int]] = None, + month: Optional[Union[list, str, int]] = None, + group: Union[List[str], str] = "CIHA", + ) -> List[File]: + files = list( + filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) + ) + + groups = [gr.upper() for gr in to_list(group)] + + if not all(gr in list(self.groups) for gr in groups): + raise ValueError( + f"Unknown CIHA Group(s): {set(groups).difference(list(self.groups))}" + ) + + files = list(filter(lambda f: self.format(f)[0] in groups, files)) + + if uf: + ufs = parse_UFs(uf) + files = list(filter(lambda f: self.format(f)[1] in ufs, files)) + + if year or str(year) in ["0", "00"]: + years = [zfill_year(str(m)[-2:]) for m in to_list(year)] + files = list(filter(lambda f: self.format(f)[2] in years, files)) + + if month: + months = [str(y)[-2:].zfill(2) for y in to_list(month)] + files = list(filter(lambda f: self.format(f)[3] in 
months, files)) + + return files + + +class CNES(Database): + name = "CNES" + paths = (Directory("/dissemin/publicos/CNES/200508_/Dados"),) + metadata = { + "long_name": "Cadastro Nacional de Estabelecimentos de Saúde", + "source": "https://cnes.datasus.gov.br/", + "description": ( + "O Cadastro Nacional de Estabelecimentos de Saúde (CNES) é o " + "sistema de informação oficial de cadastramento de informações " + "de todos os estabelecimentos de saúde no país, independentemente " + "de sua natureza jurídica ou de integrarem o Sistema Único de " + "Saúde (SUS). Trata-se do cadastro oficial do Ministério da " + "Saúde (MS) no tocante à realidade da capacidade instalada e " + "mão-de-obra assistencial de saúde no Brasil em estabelecimentos " + "de saúde públicos ou privados, com convênio SUS ou não." + ), + } + groups = { + "DC": "Dados Complementares", + "EE": "Estabelecimento de Ensino", + "EF": "Estabelecimento Filantrópico", + "EP": "Equipes", + "EQ": "Equipamentos", + "GM": "Gestão e Metas", + "HB": "Habilitação", + "IN": "Incentivos", + "LT": "Leitos", + "PF": "Profissional", + "RC": "Regra Contratual", + "SR": "Serviço Especializado", + "ST": "Estabelecimentos", + } + __loaded__ = set() + + def load( + self, + groups: Union[str, List[str]] = None, + ): + """ + Loads CNES Groups into content. 
Will convert the files and directories + found within FTP Directories into self.content + """ + if not self.__content__: + self.paths[0].load() + self.__content__ |= self.paths[0].__content__ + + if groups: + groups = to_list(groups) + + if not all(group in self.groups for group in [gr.upper() for gr in groups]): + raise ValueError( + f"Unknown CNES group(s): {set(groups).difference(self.groups)}" + ) + + for group in groups: + group = group.upper() + if group not in self.__loaded__: + directory = self.__content__[group] + directory.load() + self.__content__ |= directory.__content__ + self.__loaded__.add(directory.name) + return self + + def describe(self, file: File) -> dict: + if not isinstance(file, File): + return {} + + if file.name == "GMufAAmm": + # Leftover + return {} + + if file.extension.upper() in [".DBC", ".DBF"]: + group, _uf, year, month = self.format(file) + + try: + uf = UFs[_uf] + except KeyError: + uf = _uf + + description = { + "name": str(file.basename), + "group": self.groups[group], + "uf": uf, + "month": MONTHS[int(month)], + "year": zfill_year(year), + "size": file.info["size"], + "last_update": file.info["modify"], + } + + return description + return {} + + def format(self, file: File) -> tuple: + group, _uf = file.name[:2].upper(), file.name[2:4].upper() + year, month = file.name[-4:-2], file.name[-2:] + return group, _uf, zfill_year(year), month + + def get_files( + self, + group: Union[List[str], str], + uf: Optional[Union[List[str], str]] = None, + year: Optional[Union[list, str, int]] = None, + month: Optional[Union[list, str, int]] = None, + ) -> List[File]: + if not group: + raise ValueError("At least one CNES group is required") + + groups = [gr.upper() for gr in to_list(group)] + + self.load(groups) + + files = list(filter(lambda f: f.name[:2] in groups, self.files)) + + if uf: + ufs = parse_UFs(uf) + files = list(filter(lambda f: f.name[2:4] in ufs, files)) + + if year or str(year) in ["0", "00"]: + years = [str(m)[-2:].zfill(2) 
for m in to_list(year)] + files = list(filter(lambda f: f.name[-4:-2] in years, files)) + + if month: + months = [str(y)[-2:].zfill(2) for y in to_list(month)] + files = list(filter(lambda f: f.name[-2:] in months, files)) + + return files + + +class IBGEDATASUS(Database): + name = "IBGE-DataSUS" + paths = ( + Directory("/dissemin/publicos/IBGE/POP"), + Directory("/dissemin/publicos/IBGE/censo"), + Directory("/dissemin/publicos/IBGE/POPTCU"), + Directory("/dissemin/publicos/IBGE/projpop"), + # Directory("/dissemin/publicos/IBGE/Auxiliar") # this has a different file name pattern # noqa + ) + metadata = { + "long_name": "Populaçao Residente, Censos, Contagens " + "Populacionais e Projeçoes Intercensitarias", + "source": "ftp://ftp.datasus.gov.br/dissemin/publicos/IBGE", + "description": ( + "São aqui apresentados informações sobre a população residente, " + "estratificadas por município, faixas etárias e sexo, obtidas a " + "partir dos Censos Demográficos, Contagens Populacionais " + "e Projeções Intercensitárias." 
+ ), + } + + def describe(self, file: File) -> dict: + if file.extension.upper() in [".ZIP"]: + year = file.name.split(".")[0][-2:] + description = { + "name": str(file.basename), + "year": zfill_year(year), + "size": file.info["size"], + "last_update": file.info["modify"], + } + return description + elif file.extension.upper() == ".DBF": + year = file.name[-2:] + description = { + "name": str(file.basename), + "year": zfill_year(year), + "size": file.info["size"], + "last_update": file.info["modify"], + } + return description + return {} + + def format(self, file: File) -> tuple: + return (file.name[-2:],) + + def get_files( + self, + source: Literal["POP", "censo", "POPTCU", "projpop"] = "POPTCU", + year: Optional[Union[str, int, list]] = None, + *args, + **kwargs, + ) -> List[File]: + sources = ["POP", "censo", "POPTCU", "projpop"] + source_dir = None + + for dir in self.paths: + if source in sources and source in dir.path: + source_dir = dir + + if not source_dir: + raise ValueError(f"Unkown source {source}. 
Options: {sources}") + + files = source_dir.content + + if year: + if isinstance(year, (str, int)): + files = [ + f for f in files if self.describe(f)["year"] == zfill_year(year) + ] + elif isinstance(year, list): + files = [ + f + for f in files + if str(self.describe(f)["year"]) + in [str(zfill_year(y)) for y in year] + ] + + return files + + +class PNI(Database): + name = "PNI" + paths = (Directory("/dissemin/publicos/PNI/DADOS"),) + metadata = { + "long_name": ("Sistema de Informações do Programa Nacional de Imunizações"), + "source": ( + "https://datasus.saude.gov.br/acesso-a-informacao/morbidade-hospitalar-do-sus-sih-sus/", # noqa + "https://datasus.saude.gov.br/acesso-a-informacao/producao-hospitalar-sih-sus/", # noqa + ), + "description": ( + "O SI-PNI é um sistema desenvolvido para possibilitar aos " + "gestores envolvidos no Programa Nacional de Imunização, a " + "avaliação dinâmica do risco quanto à ocorrência de surtos ou " + "epidemias, a partir do registro dos imunobiológicos aplicados e " + "do quantitativo populacional vacinado, agregados por faixa " + "etária, período de tempo e área geográfica. Possibilita também " + "o controle do estoque de imunobiológicos necessário aos " + "administradores que têm a incumbência de programar sua aquisição " + "e distribuição. Controla as indicações de aplicação de " + "vacinas de imunobiológicos especiais e seus eventos adversos, " + "dentro dos Centros de Referências em imunobiológicos especiais." 
+ ), + } + groups = { + "CPNI": "Cobertura Vacinal", # TODO: may be incorrect + "DPNI": "Doses Aplicadas", # TODO: may be incorrect + } + + def describe(self, file: File) -> dict: + if file.extension.upper() in [".DBC", ".DBF"]: + group, _uf, year = self.format(file) + + try: + uf = UFs[_uf] + except KeyError: + uf = _uf + + description = { + "name": file.basename, + "group": self.groups[group], + "uf": uf, + "year": zfill_year(year), + "size": file.info["size"], + "last_update": file.info["modify"], + } + + return description + return {} + + def format(self, file: File) -> tuple: + if len(file.name) != 8: + raise ValueError(f"Can't format {file.name}") + + n = file.name + group, _uf, year = n[:4], n[4:6], n[-2:] + return group, _uf, zfill_year(year) + + def get_files( + self, + group: Union[list, Literal["CNPI", "DPNI"]], + uf: Optional[Union[List[str], str]] = None, + year: Optional[Union[list, str, int]] = None, + ) -> List[File]: + files = list( + filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) + ) + + groups = [gr.upper() for gr in to_list(group)] + + if not all(gr in list(self.groups) for gr in groups): + raise ValueError( + f"Unknown PNI Group(s): {set(groups).difference(list(self.groups))}" + ) + + files = list(filter(lambda f: self.format(f)[0] in groups, files)) + + if uf: + ufs = parse_UFs(uf) + files = list(filter(lambda f: self.format(f)[1] in ufs, files)) + + if year or str(year) in ["0", "00"]: + years = [zfill_year(str(m)[-2:]) for m in to_list(year)] + files = list(filter(lambda f: self.format(f)[2] in years, files)) + + return files + + +class SIA(Database): + name = "SIA" + paths = ( + Directory("/dissemin/publicos/SIASUS/199407_200712/Dados"), + Directory("/dissemin/publicos/SIASUS/200801_/Dados"), + ) + metadata = { + "long_name": "Sistema de Informações Ambulatoriais", + "source": "http://sia.datasus.gov.br/principal/index.php", + "description": ( + "O Sistema de Informação Ambulatorial (SIA) foi instituído pela " + 
"Portaria GM/MS n.º 896 de 29 de junho de 1990. Originalmente, o " + "SIA foi concebido a partir do projeto SICAPS (Sistema de " + "Informação e Controle Ambulatorial da Previdência Social), em " + "que os conceitos, os objetivos e as diretrizes criados para o " + "desenvolvimento do SICAPS foram extremamente importantes e " + "amplamente utilizados para o desenvolvimento do SIA, tais" + " como: (i) o acompanhamento das programações físicas e " + "orçamentárias; (ii) o acompanhamento das ações de saúde " + "produzidas; (iii) a agilização do pagamento e controle " + "orçamentário e financeiro; e (iv) a formação de banco de dados " + "para contribuir com a construção do SUS." + ), + } + groups = { + "AB": "APAC de Cirurgia Bariátrica", + "ABO": "APAC de Acompanhamento Pós Cirurgia Bariátrica", + "ACF": "APAC de Confecção de Fístula", + "AD": "APAC de Laudos Diversos", + "AM": "APAC de Medicamentos", + "AMP": "APAC de Acompanhamento Multiprofissional", + "AN": "APAC de Nefrologia", + "AQ": "APAC de Quimioterapia", + "AR": "APAC de Radioterapia", + "ATD": "APAC de Tratamento Dialítico", + "BI": "Boletim de Produção Ambulatorial individualizado", + "IMPBO": "", # TODO + "PA": "Produção Ambulatorial", + "PAM": "", # TODO + "PAR": "", # TODO + "PAS": "", # TODO + "PS": "RAAS Psicossocial", + "SAD": "RAAS de Atenção Domiciliar", + } + + def describe(self, file: File) -> dict: + if file.extension.upper() == ".DBC": + group, _uf, year, month = self.format(file) + + try: + uf = UFs[_uf] + except KeyError: + uf = _uf + + description = { + "name": str(file.basename), + "group": self.groups[group], + "uf": uf, + "month": MONTHS[int(month)], + "year": zfill_year(year), + "size": file.info["size"], + "last_update": file.info["modify"], + } + + return description + return {} + + def format(self, file: File) -> tuple: + if file.extension.upper() in [".DBC", ".DBF"]: + digits = "".join([d for d in file.name if d.isdigit()]) + if "_" in file.name: + name, _ = file.name.split("_") + 
digits = "".join([d for d in name if d.isdigit()]) + chars, _ = file.name.split(digits) + year, month = digits[:2], digits[2:] + group, uf = chars[:-2].upper(), chars[-2:].upper() + return group, uf, zfill_year(year), month + return () + + def get_files( + self, + group: Union[List[str], str], + uf: Optional[Union[List[str], str]] = None, + year: Optional[Union[list, str, int]] = None, + month: Optional[Union[list, str, int]] = None, + ) -> List[File]: + files = list( + filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) + ) + + groups = [gr.upper() for gr in to_list(group)] + + if not all(gr in list(self.groups) for gr in groups): + raise ValueError( + f"Unknown SIA Group(s): {set(groups).difference(list(self.groups))}" + ) + + files = list(filter(lambda f: self.format(f)[0] in groups, files)) + + if uf: + ufs = parse_UFs(uf) + files = list(filter(lambda f: self.format(f)[1] in ufs, files)) + + if year or str(year) in ["0", "00"]: + years = [zfill_year(str(m)[-2:]) for m in to_list(year)] + files = list(filter(lambda f: self.format(f)[2] in years, files)) + + if month: + months = [str(y)[-2:].zfill(2) for y in to_list(month)] + files = list(filter(lambda f: self.format(f)[3] in months, files)) + + return files + + +class SIH(Database): + name = "SIH" + paths = ( + Directory("/dissemin/publicos/SIHSUS/199201_200712/Dados"), + Directory("/dissemin/publicos/SIHSUS/200801_/Dados"), + ) + metadata = { + "long_name": "Sistema de Informações Hospitalares", + "source": ( + "https://datasus.saude.gov.br/acesso-a-informacao/morbidade-hospitalar-do-sus-sih-sus/", # noqa + "https://datasus.saude.gov.br/acesso-a-informacao/producao-hospitalar-sih-sus/", # noqa + ), + "description": ( + "A finalidade do AIH (Sistema SIHSUS) é a de transcrever todos os " + "atendimentos que provenientes de internações hospitalares que " + "foram financiadas pelo SUS, e após o processamento, gerarem " + "relatórios para os gestores que lhes possibilitem fazer os " + "pagamentos 
dos estabelecimentos de saúde. Além disso, o nível " + "Federal recebe mensalmente uma base de dados de todas as " + "internações autorizadas (aprovadas ou não para pagamento) para " + "que possam ser repassados às Secretarias de Saúde os valores de " + "Produção de Média e Alta complexidade além dos valores de CNRAC, " + "FAEC e de Hospitais Universitários – em suas variadas formas de " + "contrato de gestão." + ), + } + groups = { + "RD": "AIH Reduzida", + "RJ": "AIH Rejeitada", + "ER": "AIH Rejeitada com erro", + "SP": "Serviços Profissionais", + "CH": "Cadastro Hospitalar", + "CM": "", # TODO + } + + def describe(self, file: File) -> dict: + if file.extension.upper() in [".DBC", ".DBF"]: + group, _uf, year, month = self.format(file) + + try: + uf = UFs[_uf] + except KeyError: + uf = _uf + + description = { + "name": file.basename, + "group": self.groups[group], + "uf": uf, + "month": MONTHS[int(month)], + "year": zfill_year(year), + "size": file.info["size"], + "last_update": file.info["modify"], + } + + return description + return {} + + def format(self, file: File) -> tuple: + group, _uf = file.name[:2].upper(), file.name[2:4].upper() + year, month = file.name[-4:-2], file.name[-2:] + return group, _uf, zfill_year(year), month + + def get_files( + self, + group: Union[List[str], str], + uf: Optional[Union[List[str], str]] = None, + year: Optional[Union[list, str, int]] = None, + month: Optional[Union[list, str, int]] = None, + ) -> List[File]: + files = list( + filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) + ) + + groups = [gr.upper() for gr in to_list(group)] + + if not all(gr in list(self.groups) for gr in groups): + raise ValueError( + f"Unknown SIH Group(s): {set(groups).difference(list(self.groups))}" + ) + + files = list(filter(lambda f: self.format(f)[0] in groups, files)) + + if uf: + ufs = parse_UFs(uf) + files = list(filter(lambda f: self.format(f)[1] in ufs, files)) + + if year or str(year) in ["0", "00"]: + years = 
[zfill_year(str(m)[-2:]) for m in to_list(year)] + files = list(filter(lambda f: self.format(f)[2] in years, files)) + + if month: + months = [str(y)[-2:].zfill(2) for y in to_list(month)] + files = list(filter(lambda f: self.format(f)[3] in months, files)) + + return files + + +class SIM(Database): + name = "SIM" + paths = ( + Directory("/dissemin/publicos/SIM/CID10/DORES"), + Directory("/dissemin/publicos/SIM/CID9/DORES"), + ) + metadata = { + "long_name": "Sistema de Informação sobre Mortalidade", + "source": "http://sim.saude.gov.br", + "description": "", + } + groups = {"CID10": "DO", "CID9": "DOR"} + + def describe(self, file: File) -> dict: + group, _uf, year = self.format(file) + _groups = {v: k for k, v in self.groups.items()} + + try: + uf = UFs[_uf] + except KeyError: + uf = _uf + + description = { + "name": str(file.basename), + "uf": uf, + "year": year, + "group": _groups[group], + "size": file.info["size"], + "last_update": file.info["modify"], + } + + return description + + def format(self, file: File) -> tuple: + if "CID9" in str(file.path): + group, _uf, year = file.name[:-4], file.name[-4:-2], file.name[-2:] + else: + group, _uf, year = file.name[:-6], file.name[-6:-4], file.name[-4:] + return group, _uf, zfill_year(year) + + def get_files( + self, + group: Union[list[str], str], + uf: Optional[Union[list[str], str]] = None, + year: Optional[Union[list, str, int]] = None, + ) -> List[File]: + files = self.files + + groups = [self.groups[g.upper()] for g in to_list(group)] + + files = list(filter(lambda f: self.format(f)[0] in groups, files)) + + if uf: + ufs = parse_UFs(uf) + files = list(filter(lambda f: self.format(f)[1] in ufs, files)) + + if year or str(year) in ["0", "00"]: + years = [zfill_year(y) for y in to_list(year)] + files = list(filter(lambda f: self.format(f)[2] in years, files)) + + return files + + +class SINAN(Database): + name = "SINAN" + paths = ( + Directory("/dissemin/publicos/SINAN/DADOS/FINAIS"), + 
Directory("/dissemin/publicos/SINAN/DADOS/PRELIM"), + ) + metadata = { + "long_name": "Doenças e Agravos de Notificação", + "source": "https://portalsinan.saude.gov.br/", + "description": ( + "The Notifiable Diseases Information System - Sinan is primarily" + "fed by the notification and investigation of cases of diseases " + "and conditions listed in the national list of compulsorily " + "notifiable diseases (Consolidation Ordinance No. 4, September 28," + " 2017, Annex). However, states and municipalities are allowed to " + "include other important health problems in their region, such as " + "difilobotriasis in the municipality of São Paulo. Its effective " + "use enables the dynamic diagnosis of the occurrence of an event " + "in the population, providing evidence for causal explanations of " + "compulsorily notifiable diseases and indicating risks to which " + "people are exposed. This contributes to identifying the " + "epidemiological reality of a specific geographical area. Its " + "systematic, decentralized use contributes to the democratization " + "of information, allowing all healthcare professionals to access " + "and make it available to the community. Therefore, it is a " + "relevant tool to assist in health planning, define intervention " + "priorities, and evaluate the impact of interventions." 
+ ), + } + + diseases = { + "ACBI": "Acidente de trabalho com material biológico", + "ACGR": "Acidente de trabalho", + "ANIM": "Acidente por Animais Peçonhentos", + "ANTR": "Atendimento Antirrabico", + "BOTU": "Botulismo", + "CANC": "Cancêr relacionado ao trabalho", + "CHAG": "Doença de Chagas Aguda", + "CHIK": "Febre de Chikungunya", + "COLE": "Cólera", + "COQU": "Coqueluche", + "DENG": "Dengue", + "DERM": "Dermatoses ocupacionais", + "DIFT": "Difteria", + "ESQU": "Esquistossomose", + "EXAN": "Doença exantemáticas", + "FMAC": "Febre Maculosa", + "FTIF": "Febre Tifóide", + "HANS": "Hanseníase", + "HANT": "Hantavirose", + "HEPA": "Hepatites Virais", + "IEXO": "Intoxicação Exógena", + "INFL": "Influenza Pandêmica", + "LEIV": "Leishmaniose Visceral", + "LEPT": "Leptospirose", + "LERD": "LER/Dort", + "LTAN": "Leishmaniose Tegumentar Americana", + "MALA": "Malária", + "MENI": "Meningite", + "MENT": "Transtornos mentais relacionados ao trabalho", + "NTRA": "Notificação de Tracoma", + "PAIR": "Perda auditiva por ruído relacionado ao trabalho", + "PEST": "Peste", + "PFAN": "Paralisia Flácida Aguda", + "PNEU": "Pneumoconioses realacionadas ao trabalho", + "RAIV": "Raiva", + "SDTA": "Surto Doenças Transmitidas por Alimentos", + "SIFA": "Sífilis Adquirida", + "SIFC": "Sífilis Congênita", + "SIFG": "Sífilis em Gestante", + "SRC": "Síndrome da Rubéola Congênia", + "TETA": "Tétano Acidental", + "TETN": "Tétano Neonatal", + "TOXC": "Toxoplasmose Congênita", + "TOXG": "Toxoplasmose Gestacional", + "TRAC": "Inquérito de Tracoma", + "TUBE": "Tuberculose", + "VARC": "Varicela", + "VIOL": "Violência doméstica, sexual e/ou outras violências", + "ZIKA": "Zika Vírus", + } + + def describe(self, file: File) -> dict: + if file.extension.upper() == ".DBC": + dis_code, year = self.format(file) + + description = { + "name": str(file.basename), + "disease": self.diseases[dis_code], + "year": zfill_year(year), + "size": file.info["size"], + "last_update": file.info["modify"], + } + return 
description + return {} + + def format(self, file: File) -> tuple: + year = file.name[-2:] + + if file.name.startswith("SRC"): + dis_code = file.name[:3] + elif file.name == "LEIBR22": + dis_code = "LEIV" # MISPELLED FILE NAME + elif file.name == "LERBR19": + dis_code = "LERD" # ANOTHER ONE + else: + dis_code = file.name[:4] + + return dis_code, zfill_year(year) + + def get_files( + self, + dis_code: Optional[Union[str, list]] = None, + year: Optional[Union[str, int, list]] = None, + ) -> List[File]: + files = list( + filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) + ) + + if dis_code: + codes = [c.upper() for c in to_list(dis_code)] + + if codes and not all(code in self.diseases for code in codes): + raise ValueError( + f"Unknown disease(s): {set(codes).difference(set(self.diseases))}" + ) + + files = list(filter(lambda f: self.format(f)[0] in codes, files)) + + if year or str(year) in ["0", "00"]: + years = [zfill_year(str(y)[-2:]) for y in to_list(year)] + files = list(filter(lambda f: self.format(f)[1] in years, files)) + + return files + + +class SINASC(Database): + name = "SINASC" + paths = ( + Directory("/dissemin/publicos/SINASC/NOV/DNRES"), + Directory("/dissemin/publicos/SINASC/ANT/DNRES"), + ) + metadata = { + "long_name": "Sistema de Informações sobre Nascidos Vivos", + "source": "http://sinasc.saude.gov.br/", + "description": "", + } + groups = { + "DN": "Declarações de Nascidos Vivos", + "DNR": "Dados dos Nascidos Vivos por UF de residência", + } + + def describe(self, file: File) -> dict: + if file.extension.upper() == ".DBC": + group, _uf, year = self.format(file) + + try: + uf = UFs[_uf] + except KeyError: + uf = _uf + + description = { + "name": file.basename, + "group": self.groups[group], + "uf": uf, + "year": year, + "size": file.info["size"], + "last_update": file.info["modify"], + } + + return description + return {} + + def format(self, file: File) -> tuple: + if file.name == "DNEX2021": + pass + + year = 
zfill_year(file.name[-2:]) + charname = "".join([c for c in file.name if not c.isnumeric()]) + group, _uf = charname[:-2], charname[-2:] + return group, _uf, zfill_year(year) + + def get_files( + self, + group: Union[List[str], str], + uf: Optional[Union[List[str], str]] = None, + year: Optional[Union[List, str, int]] = None, + ) -> List[File]: + files = self.files + + groups = to_list(group) + + files = list(filter(lambda f: self.format(f)[0] in groups, files)) + + if uf: + if "EX" in to_list(uf): + # DNEX2021 + if len(to_list(uf)) == 1: + return [] + + to_list(uf).remove("EX") + + ufs = parse_UFs(uf) + files = list(filter(lambda f: self.format(f)[1] in ufs, files)) + + if year or str(year) in ["0", "00"]: + years = [zfill_year(str(y)[-2:]) for y in to_list(year)] + files = list(filter(lambda f: self.format(f)[2] in years, files)) + + return files diff --git a/pysus/api/ftp/databases/__init__.py b/pysus/api/ftp/databases/__init__.py deleted file mode 100644 index 8ad52e98..00000000 --- a/pysus/api/ftp/databases/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -from .ciha import CIHA -from .cnes import CNES -from .ibge_datasus import IBGEDATASUS -from .pni import PNI -from .sia import SIA -from .sih import SIH -from .sim import SIM -from .sinan import SINAN -from .sinasc import SINASC - -AVAILABLE_DATABASES = [ - CIHA, - CNES, - IBGEDATASUS, - PNI, - SIA, - SIH, - SIM, - SINAN, - SINASC, -] - -__all__ = [ - "CIHA", - "CNES", - "IBGEDATASUS", - "PNI", - "SIA", - "SIH", - "SIM", - "SINAN", - "SINASC", - "AVAILABLE_DATABASES", -] diff --git a/pysus/api/ftp/databases/ciha.py b/pysus/api/ftp/databases/ciha.py deleted file mode 100644 index b84d18ab..00000000 --- a/pysus/api/ftp/databases/ciha.py +++ /dev/null @@ -1,103 +0,0 @@ -__all__ = ["CIHA"] - -from typing import List, Optional, Union - -from pysus.api.ftp import Database, Directory, File -from pysus.utils import MONTHS, UFs, parse_UFs, to_list, zfill_year - - -class CIHA(Database): - name = "CIHA" - paths = 
(Directory("/dissemin/publicos/CIHA/201101_/Dados"),) - metadata = { - "long_name": "Comunicação de Internação Hospitalar e Ambulatorial", - "source": "http://ciha.datasus.gov.br/CIHA/index.php", - "description": ( - "A CIHA foi criada para ampliar o processo de planejamento, " - "programação, controle, avaliação e regulação da assistência à " - "saúde permitindo um conhecimento mais abrangente e profundo dos " - "perfis nosológico e epidemiológico da população brasileira, da " - "capacidade instalada e do potencial de produção de serviços do " - "conjunto de estabelecimentos de saúde do País. O sistema permite " - "o acompanhamento das ações e serviços de saúde custeados " - "por: planos privados de assistência à saúde; planos públicos; " - "pagamento particular por pessoa física; pagamento particular por " - "pessoa jurídica; programas e projetos federais (PRONON, PRONAS, " - "PROADI); recursos próprios das secretarias municipais e estaduais" - " de saúde; DPVAT; gratuidade e, a partir da publicação da " - "Portaria GM/MS nº 2.905/2022, consórcios públicos. 
As " - "informações registradas na CIHA servem como base para o processo " - "de Certificação de Entidades Beneficentes de Assistência Social " - "em Saúde (CEBAS) e para monitoramento dos programas PRONAS e " - "PRONON" - ), - } - groups = { - "CIHA": "Comunicação de Internação Hospitalar e Ambulatorial", - } - - def describe(self, file: File): - if not isinstance(file, File): - return file - - if file.extension.upper() in [".DBC", ".DBF"]: - group, _uf, year, month = self.format(file) - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf - - description = { - "name": str(file.basename), - "group": self.groups[group], - "uf": uf, - "month": MONTHS[int(month)], - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } - - return description - return file - - def format(self, file: File) -> tuple: - group, _uf = file.name[:4].upper(), file.name[4:6].upper() - year, month = file.name[-4:-2], file.name[-2:] - return group, _uf, zfill_year(year), month - - def get_files( - self, - uf: Optional[Union[List[str], str]] = None, - year: Optional[Union[list, str, int]] = None, - month: Optional[Union[list, str, int]] = None, - group: Union[List[str], str] = "CIHA", - ) -> List[File]: - files = list( - filter(lambda f: f.extension.upper() - in [".DBC", ".DBF"], self.files) - ) - - groups = [gr.upper() for gr in to_list(group)] - - if not all(gr in list(self.groups) for gr in groups): - raise ValueError( - f"Unknown CIHA Group(s): {set( - groups).difference(list(self.groups))}" - ) - - files = list(filter(lambda f: self.format(f)[0] in groups, files)) - - if uf: - ufs = parse_UFs(uf) - files = list(filter(lambda f: self.format(f)[1] in ufs, files)) - - if year or str(year) in ["0", "00"]: - years = [zfill_year(str(m)[-2:]) for m in to_list(year)] - files = list(filter(lambda f: self.format(f)[2] in years, files)) - - if month: - months = [str(y)[-2:].zfill(2) for y in to_list(month)] - files = list(filter(lambda f: self.format(f)[3] in 
months, files)) - - return files diff --git a/pysus/api/ftp/databases/cnes.py b/pysus/api/ftp/databases/cnes.py deleted file mode 100644 index 61235fba..00000000 --- a/pysus/api/ftp/databases/cnes.py +++ /dev/null @@ -1,135 +0,0 @@ -__all__ = ["CNES"] - -from typing import List, Optional, Union - -from pysus.api.ftp import Database, Directory, File -from pysus.utils import MONTHS, UFs, parse_UFs, to_list, zfill_year - - -class CNES(Database): - name = "CNES" - paths = (Directory("/dissemin/publicos/CNES/200508_/Dados"),) - metadata = { - "long_name": "Cadastro Nacional de Estabelecimentos de Saúde", - "source": "https://cnes.datasus.gov.br/", - "description": ( - "O Cadastro Nacional de Estabelecimentos de Saúde (CNES) é o " - "sistema de informação oficial de cadastramento de informações " - "de todos os estabelecimentos de saúde no país, independentemente " - "de sua natureza jurídica ou de integrarem o Sistema Único de " - "Saúde (SUS). Trata-se do cadastro oficial do Ministério da " - "Saúde (MS) no tocante à realidade da capacidade instalada e " - "mão-de-obra assistencial de saúde no Brasil em estabelecimentos " - "de saúde públicos ou privados, com convênio SUS ou não." - ), - } - groups = { - "DC": "Dados Complementares", - "EE": "Estabelecimento de Ensino", - "EF": "Estabelecimento Filantrópico", - "EP": "Equipes", - "EQ": "Equipamentos", - "GM": "Gestão e Metas", - "HB": "Habilitação", - "IN": "Incentivos", - "LT": "Leitos", - "PF": "Profissional", - "RC": "Regra Contratual", - "SR": "Serviço Especializado", - "ST": "Estabelecimentos", - } - __loaded__ = set() - - def load( - self, - groups: Union[str, List[str]] = None, - ): - """ - Loads CNES Groups into content. 
Will convert the files and directories - found within FTP Directories into self.content - """ - if not self.__content__: - self.paths[0].load() - self.__content__ |= self.paths[0].__content__ - - if groups: - groups = to_list(groups) - - if not all(group in self.groups for group in [gr.upper() for gr in groups]): - raise ValueError( - f"Unknown CNES group(s): {set( - groups).difference(self.groups)}" - ) - - for group in groups: - group = group.upper() - if group not in self.__loaded__: - directory = self.__content__[group] - directory.load() - self.__content__ |= directory.__content__ - self.__loaded__.add(directory.name) - return self - - def describe(self, file: File) -> dict: - if not isinstance(file, File): - return {} - - if file.name == "GMufAAmm": - # Leftover - return {} - - if file.extension.upper() in [".DBC", ".DBF"]: - group, _uf, year, month = self.format(file) - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf - - description = { - "name": str(file.basename), - "group": self.groups[group], - "uf": uf, - "month": MONTHS[int(month)], - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } - - return description - return {} - - def format(self, file: File) -> tuple: - group, _uf = file.name[:2].upper(), file.name[2:4].upper() - year, month = file.name[-4:-2], file.name[-2:] - return group, _uf, zfill_year(year), month - - def get_files( - self, - group: Union[List[str], str], - uf: Optional[Union[List[str], str]] = None, - year: Optional[Union[list, str, int]] = None, - month: Optional[Union[list, str, int]] = None, - ) -> List[File]: - if not group: - raise ValueError("At least one CNES group is required") - - groups = [gr.upper() for gr in to_list(group)] - - self.load(groups) - - files = list(filter(lambda f: f.name[:2] in groups, self.files)) - - if uf: - ufs = parse_UFs(uf) - files = list(filter(lambda f: f.name[2:4] in ufs, files)) - - if year or str(year) in ["0", "00"]: - years = 
[str(m)[-2:].zfill(2) for m in to_list(year)] - files = list(filter(lambda f: f.name[-4:-2] in years, files)) - - if month: - months = [str(y)[-2:].zfill(2) for y in to_list(month)] - files = list(filter(lambda f: f.name[-2:] in months, files)) - - return files diff --git a/pysus/api/ftp/databases/ibge_datasus.py b/pysus/api/ftp/databases/ibge_datasus.py deleted file mode 100644 index 39fa6c02..00000000 --- a/pysus/api/ftp/databases/ibge_datasus.py +++ /dev/null @@ -1,86 +0,0 @@ -__all__ = ["IBGEDATASUS"] - -from typing import List, Literal, Optional, Union - -from pysus.api.ftp import Database, Directory, File -from pysus.utils import zfill_year - - -class IBGEDATASUS(Database): - name = "IBGE-DataSUS" - paths = ( - Directory("/dissemin/publicos/IBGE/POP"), - Directory("/dissemin/publicos/IBGE/censo"), - Directory("/dissemin/publicos/IBGE/POPTCU"), - Directory("/dissemin/publicos/IBGE/projpop"), - # Directory("/dissemin/publicos/IBGE/Auxiliar") # this has a different file name pattern # noqa - ) - metadata = { - "long_name": "Populaçao Residente, Censos, Contagens " - "Populacionais e Projeçoes Intercensitarias", - "source": "ftp://ftp.datasus.gov.br/dissemin/publicos/IBGE", - "description": ( - "São aqui apresentados informações sobre a população residente, " - "estratificadas por município, faixas etárias e sexo, obtidas a " - "partir dos Censos Demográficos, Contagens Populacionais " - "e Projeções Intercensitárias." 
- ), - } - - def describe(self, file: File) -> dict: - if file.extension.upper() in [".ZIP"]: - year = file.name.split(".")[0][-2:] - description = { - "name": str(file.basename), - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } - return description - elif file.extension.upper() == ".DBF": - year = file.name[-2:] - description = { - "name": str(file.basename), - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } - return description - return {} - - def format(self, file: File) -> tuple: - return (file.name[-2:],) - - def get_files( - self, - source: Literal["POP", "censo", "POPTCU", "projpop"] = "POPTCU", - year: Optional[Union[str, int, list]] = None, - *args, - **kwargs, - ) -> List[File]: - sources = ["POP", "censo", "POPTCU", "projpop"] - source_dir = None - - for dir in self.paths: - if source in sources and source in dir.path: - source_dir = dir - - if not source_dir: - raise ValueError(f"Unkown source {source}. 
Options: {sources}") - - files = source_dir.content - - if year: - if isinstance(year, (str, int)): - files = [ - f for f in files if self.describe(f)["year"] == zfill_year(year) - ] - elif isinstance(year, list): - files = [ - f - for f in files - if str(self.describe(f)["year"]) - in [str(zfill_year(y)) for y in year] - ] - - return files diff --git a/pysus/api/ftp/databases/pni.py b/pysus/api/ftp/databases/pni.py deleted file mode 100644 index ef154287..00000000 --- a/pysus/api/ftp/databases/pni.py +++ /dev/null @@ -1,95 +0,0 @@ -__all__ = ["PNI"] - -from typing import List, Literal, Optional, Union - -from pysus.api.ftp import Database, Directory, File -from pysus.utils import UFs, parse_UFs, to_list, zfill_year - - -class PNI(Database): - name = "PNI" - paths = (Directory("/dissemin/publicos/PNI/DADOS"),) - metadata = { - "long_name": ("Sistema de Informações do Programa Nacional de Imunizações"), - "source": ( - "https://datasus.saude.gov.br/acesso-a-informacao/morbidade-hospitalar-do-sus-sih-sus/", # noqa - "https://datasus.saude.gov.br/acesso-a-informacao/producao-hospitalar-sih-sus/", # noqa - ), - "description": ( - "O SI-PNI é um sistema desenvolvido para possibilitar aos " - "gestores envolvidos no Programa Nacional de Imunização, a " - "avaliação dinâmica do risco quanto à ocorrência de surtos ou " - "epidemias, a partir do registro dos imunobiológicos aplicados e " - "do quantitativo populacional vacinado, agregados por faixa " - "etária, período de tempo e área geográfica. Possibilita também " - "o controle do estoque de imunobiológicos necessário aos " - "administradores que têm a incumbência de programar sua aquisição " - "e distribuição. Controla as indicações de aplicação de " - "vacinas de imunobiológicos especiais e seus eventos adversos, " - "dentro dos Centros de Referências em imunobiológicos especiais." 
- ), - } - groups = { - "CPNI": "Cobertura Vacinal", # TODO: may be incorrect - "DPNI": "Doses Aplicadas", # TODO: may be incorrect - } - - def describe(self, file: File) -> dict: - if file.extension.upper() in [".DBC", ".DBF"]: - group, _uf, year = self.format(file) - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf - - description = { - "name": file.basename, - "group": self.groups[group], - "uf": uf, - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } - - return description - return {} - - def format(self, file: File) -> tuple: - if len(file.name) != 8: - raise ValueError(f"Can't format {file.name}") - - n = file.name - group, _uf, year = n[:4], n[4:6], n[-2:] - return group, _uf, zfill_year(year) - - def get_files( - self, - group: Union[list, Literal["CNPI", "DPNI"]], - uf: Optional[Union[List[str], str]] = None, - year: Optional[Union[list, str, int]] = None, - ) -> List[File]: - files = list( - filter(lambda f: f.extension.upper() - in [".DBC", ".DBF"], self.files) - ) - - groups = [gr.upper() for gr in to_list(group)] - - if not all(gr in list(self.groups) for gr in groups): - raise ValueError( - f"Unknown PNI Group(s): {set( - groups).difference(list(self.groups))}" - ) - - files = list(filter(lambda f: self.format(f)[0] in groups, files)) - - if uf: - ufs = parse_UFs(uf) - files = list(filter(lambda f: self.format(f)[1] in ufs, files)) - - if year or str(year) in ["0", "00"]: - years = [zfill_year(str(m)[-2:]) for m in to_list(year)] - files = list(filter(lambda f: self.format(f)[2] in years, files)) - - return files diff --git a/pysus/api/ftp/databases/sia.py b/pysus/api/ftp/databases/sia.py deleted file mode 100644 index 3f28d809..00000000 --- a/pysus/api/ftp/databases/sia.py +++ /dev/null @@ -1,122 +0,0 @@ -__all__ = ["SIA"] - -from typing import List, Optional, Union - -from pysus.api.ftp import Database, Directory, File -from pysus.utils import MONTHS, UFs, parse_UFs, to_list, zfill_year - - -class 
SIA(Database): - name = "SIA" - paths = ( - Directory("/dissemin/publicos/SIASUS/199407_200712/Dados"), - Directory("/dissemin/publicos/SIASUS/200801_/Dados"), - ) - metadata = { - "long_name": "Sistema de Informações Ambulatoriais", - "source": "http://sia.datasus.gov.br/principal/index.php", - "description": ( - "O Sistema de Informação Ambulatorial (SIA) foi instituído pela " - "Portaria GM/MS n.º 896 de 29 de junho de 1990. Originalmente, o " - "SIA foi concebido a partir do projeto SICAPS (Sistema de " - "Informação e Controle Ambulatorial da Previdência Social), em " - "que os conceitos, os objetivos e as diretrizes criados para o " - "desenvolvimento do SICAPS foram extremamente importantes e " - "amplamente utilizados para o desenvolvimento do SIA, tais" - " como: (i) o acompanhamento das programações físicas e " - "orçamentárias; (ii) o acompanhamento das ações de saúde " - "produzidas; (iii) a agilização do pagamento e controle " - "orçamentário e financeiro; e (iv) a formação de banco de dados " - "para contribuir com a construção do SUS." 
- ), - } - groups = { - "AB": "APAC de Cirurgia Bariátrica", - "ABO": "APAC de Acompanhamento Pós Cirurgia Bariátrica", - "ACF": "APAC de Confecção de Fístula", - "AD": "APAC de Laudos Diversos", - "AM": "APAC de Medicamentos", - "AMP": "APAC de Acompanhamento Multiprofissional", - "AN": "APAC de Nefrologia", - "AQ": "APAC de Quimioterapia", - "AR": "APAC de Radioterapia", - "ATD": "APAC de Tratamento Dialítico", - "BI": "Boletim de Produção Ambulatorial individualizado", - "IMPBO": "", # TODO - "PA": "Produção Ambulatorial", - "PAM": "", # TODO - "PAR": "", # TODO - "PAS": "", # TODO - "PS": "RAAS Psicossocial", - "SAD": "RAAS de Atenção Domiciliar", - } - - def describe(self, file: File) -> dict: - if file.extension.upper() == ".DBC": - group, _uf, year, month = self.format(file) - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf - - description = { - "name": str(file.basename), - "group": self.groups[group], - "uf": uf, - "month": MONTHS[int(month)], - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } - - return description - return {} - - def format(self, file: File) -> tuple: - if file.extension.upper() in [".DBC", ".DBF"]: - digits = "".join([d for d in file.name if d.isdigit()]) - if "_" in file.name: - name, _ = file.name.split("_") - digits = "".join([d for d in name if d.isdigit()]) - chars, _ = file.name.split(digits) - year, month = digits[:2], digits[2:] - group, uf = chars[:-2].upper(), chars[-2:].upper() - return group, uf, zfill_year(year), month - return () - - def get_files( - self, - group: Union[List[str], str], - uf: Optional[Union[List[str], str]] = None, - year: Optional[Union[list, str, int]] = None, - month: Optional[Union[list, str, int]] = None, - ) -> List[File]: - files = list( - filter(lambda f: f.extension.upper() - in [".DBC", ".DBF"], self.files) - ) - - groups = [gr.upper() for gr in to_list(group)] - - if not all(gr in list(self.groups) for gr in groups): - raise ValueError( - 
f"Unknown SIA Group(s): {set( - groups).difference(list(self.groups))}" - ) - - files = list(filter(lambda f: self.format(f)[0] in groups, files)) - - if uf: - ufs = parse_UFs(uf) - files = list(filter(lambda f: self.format(f)[1] in ufs, files)) - - if year or str(year) in ["0", "00"]: - years = [zfill_year(str(m)[-2:]) for m in to_list(year)] - files = list(filter(lambda f: self.format(f)[2] in years, files)) - - if month: - months = [str(y)[-2:].zfill(2) for y in to_list(month)] - files = list(filter(lambda f: self.format(f)[3] in months, files)) - - return files diff --git a/pysus/api/ftp/databases/sih.py b/pysus/api/ftp/databases/sih.py deleted file mode 100644 index 0c28400d..00000000 --- a/pysus/api/ftp/databases/sih.py +++ /dev/null @@ -1,105 +0,0 @@ -__all__ = ["SIH"] - -from typing import List, Optional, Union - -from pysus.api.ftp import Database, Directory, File -from pysus.utils import MONTHS, UFs, parse_UFs, to_list, zfill_year - - -class SIH(Database): - name = "SIH" - paths = ( - Directory("/dissemin/publicos/SIHSUS/199201_200712/Dados"), - Directory("/dissemin/publicos/SIHSUS/200801_/Dados"), - ) - metadata = { - "long_name": "Sistema de Informações Hospitalares", - "source": ( - "https://datasus.saude.gov.br/acesso-a-informacao/morbidade-hospitalar-do-sus-sih-sus/", # noqa - "https://datasus.saude.gov.br/acesso-a-informacao/producao-hospitalar-sih-sus/", # noqa - ), - "description": ( - "A finalidade do AIH (Sistema SIHSUS) é a de transcrever todos os " - "atendimentos que provenientes de internações hospitalares que " - "foram financiadas pelo SUS, e após o processamento, gerarem " - "relatórios para os gestores que lhes possibilitem fazer os " - "pagamentos dos estabelecimentos de saúde. 
Além disso, o nível " - "Federal recebe mensalmente uma base de dados de todas as " - "internações autorizadas (aprovadas ou não para pagamento) para " - "que possam ser repassados às Secretarias de Saúde os valores de " - "Produção de Média e Alta complexidade além dos valores de CNRAC, " - "FAEC e de Hospitais Universitários – em suas variadas formas de " - "contrato de gestão." - ), - } - groups = { - "RD": "AIH Reduzida", - "RJ": "AIH Rejeitada", - "ER": "AIH Rejeitada com erro", - "SP": "Serviços Profissionais", - "CH": "Cadastro Hospitalar", - "CM": "", # TODO - } - - def describe(self, file: File) -> dict: - if file.extension.upper() in [".DBC", ".DBF"]: - group, _uf, year, month = self.format(file) - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf - - description = { - "name": file.basename, - "group": self.groups[group], - "uf": uf, - "month": MONTHS[int(month)], - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } - - return description - return {} - - def format(self, file: File) -> tuple: - group, _uf = file.name[:2].upper(), file.name[2:4].upper() - year, month = file.name[-4:-2], file.name[-2:] - return group, _uf, zfill_year(year), month - - def get_files( - self, - group: Union[List[str], str], - uf: Optional[Union[List[str], str]] = None, - year: Optional[Union[list, str, int]] = None, - month: Optional[Union[list, str, int]] = None, - ) -> List[File]: - files = list( - filter(lambda f: f.extension.upper() - in [".DBC", ".DBF"], self.files) - ) - - groups = [gr.upper() for gr in to_list(group)] - - if not all(gr in list(self.groups) for gr in groups): - raise ValueError( - f"Unknown SIH Group(s): {set( - groups).difference(list(self.groups))}" - ) - - files = list(filter(lambda f: self.format(f)[0] in groups, files)) - - if uf: - ufs = parse_UFs(uf) - files = list(filter(lambda f: self.format(f)[1] in ufs, files)) - - if year or str(year) in ["0", "00"]: - years = [zfill_year(str(m)[-2:]) for m in 
to_list(year)] - files = list(filter(lambda f: self.format(f)[2] in years, files)) - - if month: - months = [str(y)[-2:].zfill(2) for y in to_list(month)] - files = list(filter(lambda f: self.format(f)[3] in months, files)) - - return files diff --git a/pysus/api/ftp/databases/sim.py b/pysus/api/ftp/databases/sim.py deleted file mode 100644 index 0a85aa1f..00000000 --- a/pysus/api/ftp/databases/sim.py +++ /dev/null @@ -1,69 +0,0 @@ -__all__ = ["SIM"] - -from typing import List, Optional, Union - -from pysus.api.ftp import Database, Directory, File -from pysus.utils import UFs, parse_UFs, to_list, zfill_year - - -class SIM(Database): - name = "SIM" - paths = ( - Directory("/dissemin/publicos/SIM/CID10/DORES"), - Directory("/dissemin/publicos/SIM/CID9/DORES"), - ) - metadata = { - "long_name": "Sistema de Informação sobre Mortalidade", - "source": "http://sim.saude.gov.br", - "description": "", - } - groups = {"CID10": "DO", "CID9": "DOR"} - - def describe(self, file: File) -> dict: - group, _uf, year = self.format(file) - _groups = {v: k for k, v in self.groups.items()} - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf - - description = { - "name": str(file.basename), - "uf": uf, - "year": year, - "group": _groups[group], - "size": file.info["size"], - "last_update": file.info["modify"], - } - - return description - - def format(self, file: File) -> tuple: - if "CID9" in str(file.path): - group, _uf, year = file.name[:-4], file.name[-4:-2], file.name[-2:] - else: - group, _uf, year = file.name[:-6], file.name[-6:-4], file.name[-4:] - return group, _uf, zfill_year(year) - - def get_files( - self, - group: Union[list[str], str], - uf: Optional[Union[list[str], str]] = None, - year: Optional[Union[list, str, int]] = None, - ) -> List[File]: - files = self.files - - groups = [self.groups[g.upper()] for g in to_list(group)] - - files = list(filter(lambda f: self.format(f)[0] in groups, files)) - - if uf: - ufs = parse_UFs(uf) - files = list(filter(lambda f: 
self.format(f)[1] in ufs, files)) - - if year or str(year) in ["0", "00"]: - years = [zfill_year(y) for y in to_list(year)] - files = list(filter(lambda f: self.format(f)[2] in years, files)) - - return files diff --git a/pysus/api/ftp/databases/sinan.py b/pysus/api/ftp/databases/sinan.py deleted file mode 100644 index f272d016..00000000 --- a/pysus/api/ftp/databases/sinan.py +++ /dev/null @@ -1,144 +0,0 @@ -__all__ = ["SINAN"] - -from typing import List, Optional, Union - -from pysus.api.ftp import Database, Directory, File -from pysus.utils import to_list, zfill_year - - -class SINAN(Database): - name = "SINAN" - paths = ( - Directory("/dissemin/publicos/SINAN/DADOS/FINAIS"), - Directory("/dissemin/publicos/SINAN/DADOS/PRELIM"), - ) - metadata = { - "long_name": "Doenças e Agravos de Notificação", - "source": "https://portalsinan.saude.gov.br/", - "description": ( - "The Notifiable Diseases Information System - Sinan is primarily" - "fed by the notification and investigation of cases of diseases " - "and conditions listed in the national list of compulsorily " - "notifiable diseases (Consolidation Ordinance No. 4, September 28," - " 2017, Annex). However, states and municipalities are allowed to " - "include other important health problems in their region, such as " - "difilobotriasis in the municipality of São Paulo. Its effective " - "use enables the dynamic diagnosis of the occurrence of an event " - "in the population, providing evidence for causal explanations of " - "compulsorily notifiable diseases and indicating risks to which " - "people are exposed. This contributes to identifying the " - "epidemiological reality of a specific geographical area. Its " - "systematic, decentralized use contributes to the democratization " - "of information, allowing all healthcare professionals to access " - "and make it available to the community. 
Therefore, it is a " - "relevant tool to assist in health planning, define intervention " - "priorities, and evaluate the impact of interventions." - ), - } - - diseases = { - "ACBI": "Acidente de trabalho com material biológico", - "ACGR": "Acidente de trabalho", - "ANIM": "Acidente por Animais Peçonhentos", - "ANTR": "Atendimento Antirrabico", - "BOTU": "Botulismo", - "CANC": "Cancêr relacionado ao trabalho", - "CHAG": "Doença de Chagas Aguda", - "CHIK": "Febre de Chikungunya", - "COLE": "Cólera", - "COQU": "Coqueluche", - "DENG": "Dengue", - "DERM": "Dermatoses ocupacionais", - "DIFT": "Difteria", - "ESQU": "Esquistossomose", - "EXAN": "Doença exantemáticas", - "FMAC": "Febre Maculosa", - "FTIF": "Febre Tifóide", - "HANS": "Hanseníase", - "HANT": "Hantavirose", - "HEPA": "Hepatites Virais", - "IEXO": "Intoxicação Exógena", - "INFL": "Influenza Pandêmica", - "LEIV": "Leishmaniose Visceral", - "LEPT": "Leptospirose", - "LERD": "LER/Dort", - "LTAN": "Leishmaniose Tegumentar Americana", - "MALA": "Malária", - "MENI": "Meningite", - "MENT": "Transtornos mentais relacionados ao trabalho", - "NTRA": "Notificação de Tracoma", - "PAIR": "Perda auditiva por ruído relacionado ao trabalho", - "PEST": "Peste", - "PFAN": "Paralisia Flácida Aguda", - "PNEU": "Pneumoconioses realacionadas ao trabalho", - "RAIV": "Raiva", - "SDTA": "Surto Doenças Transmitidas por Alimentos", - "SIFA": "Sífilis Adquirida", - "SIFC": "Sífilis Congênita", - "SIFG": "Sífilis em Gestante", - "SRC": "Síndrome da Rubéola Congênia", - "TETA": "Tétano Acidental", - "TETN": "Tétano Neonatal", - "TOXC": "Toxoplasmose Congênita", - "TOXG": "Toxoplasmose Gestacional", - "TRAC": "Inquérito de Tracoma", - "TUBE": "Tuberculose", - "VARC": "Varicela", - "VIOL": "Violência doméstica, sexual e/ou outras violências", - "ZIKA": "Zika Vírus", - } - - def describe(self, file: File) -> dict: - if file.extension.upper() == ".DBC": - dis_code, year = self.format(file) - - description = { - "name": str(file.basename), - 
"disease": self.diseases[dis_code], - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } - return description - return {} - - def format(self, file: File) -> tuple: - year = file.name[-2:] - - if file.name.startswith("SRC"): - dis_code = file.name[:3] - elif file.name == "LEIBR22": - dis_code = "LEIV" # MISPELLED FILE NAME - elif file.name == "LERBR19": - dis_code = "LERD" # ANOTHER ONE - else: - dis_code = file.name[:4] - - return dis_code, zfill_year(year) - - def get_files( - self, - dis_code: Optional[Union[str, list]] = None, - year: Optional[Union[str, int, list]] = None, - ) -> List[File]: - files = list( - filter(lambda f: f.extension.upper() - in [".DBC", ".DBF"], self.files) - ) - - if dis_code: - codes = [c.upper() for c in to_list(dis_code)] - - if codes and not all(code in self.diseases for code in codes): - raise ValueError( - f"Unknown disease(s): {set( - codes).difference(set(self.diseases))}" - ) - - files = list(filter(lambda f: self.format(f)[0] in codes, files)) - - if year or str(year) in ["0", "00"]: - years = [zfill_year(str(y)[-2:]) for y in to_list(year)] - files = list(filter(lambda f: self.format(f)[1] in years, files)) - - return files diff --git a/pysus/api/ftp/databases/sinasc.py b/pysus/api/ftp/databases/sinasc.py deleted file mode 100644 index f7e73c29..00000000 --- a/pysus/api/ftp/databases/sinasc.py +++ /dev/null @@ -1,82 +0,0 @@ -__all__ = ["SINASC"] - -from typing import List, Optional, Union - -from pysus.api.ftp import Database, Directory, File -from pysus.utils import UFs, parse_UFs, to_list, zfill_year - - -class SINASC(Database): - name = "SINASC" - paths = ( - Directory("/dissemin/publicos/SINASC/NOV/DNRES"), - Directory("/dissemin/publicos/SINASC/ANT/DNRES"), - ) - metadata = { - "long_name": "Sistema de Informações sobre Nascidos Vivos", - "source": "http://sinasc.saude.gov.br/", - "description": "", - } - groups = { - "DN": "Declarações de Nascidos Vivos", - "DNR": "Dados dos 
Nascidos Vivos por UF de residência", - } - - def describe(self, file: File) -> dict: - if file.extension.upper() == ".DBC": - group, _uf, year = self.format(file) - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf - - description = { - "name": file.basename, - "group": self.groups[group], - "uf": uf, - "year": year, - "size": file.info["size"], - "last_update": file.info["modify"], - } - - return description - return {} - - def format(self, file: File) -> tuple: - if file.name == "DNEX2021": - pass - - year = zfill_year(file.name[-2:]) - charname = "".join([c for c in file.name if not c.isnumeric()]) - group, _uf = charname[:-2], charname[-2:] - return group, _uf, zfill_year(year) - - def get_files( - self, - group: Union[List[str], str], - uf: Optional[Union[List[str], str]] = None, - year: Optional[Union[List, str, int]] = None, - ) -> List[File]: - files = self.files - - groups = to_list(group) - - files = list(filter(lambda f: self.format(f)[0] in groups, files)) - - if uf: - if "EX" in to_list(uf): - # DNEX2021 - if len(to_list(uf)) == 1: - return [] - - to_list(uf).remove("EX") - - ufs = parse_UFs(uf) - files = list(filter(lambda f: self.format(f)[1] in ufs, files)) - - if year or str(year) in ["0", "00"]: - years = [zfill_year(str(y)[-2:]) for y in to_list(year)] - files = list(filter(lambda f: self.format(f)[2] in years, files)) - - return files From bea300d3b1d94b06fe85f387706f78a1f5320dd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=A3=20Bida=20Vacaro?= Date: Mon, 16 Mar 2026 12:30:13 -0300 Subject: [PATCH 5/6] normalize FileDescription to prepare for ducklake implementation --- pyproject.toml | 11 +- pysus/api/ducklake/models.py | 2 - pysus/api/ftp/client.py | 2 +- pysus/api/ftp/databases.py | 337 ++++++++++++++++------------------- pysus/api/ftp/models.py | 29 +++ pysus/management/__init__.py | 0 6 files changed, 188 insertions(+), 193 deletions(-) create mode 100644 pysus/api/ftp/models.py create mode 100644 pysus/management/__init__.py diff 
--git a/pyproject.toml b/pyproject.toml index f9504e76..f3df5be4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,10 +2,17 @@ name = "pysus" version = "1.0.1" # changed by semantic-release description = "Tools for dealing with Brazil's Public health data" -authors = ["Flavio Codeco Coelho "] +authors = ["Flavio Codeco Coelho ", "Luã Bida Vacaro "] license = "GPL" -packages = [{include='pysus'}] +packages = [{ include = "pysus"}] + +exclude = [ + "pysus/tests", + "pysus/tests/**", + "pysus/management", + "pysus/management/**" +] [tool.poetry.dependencies] python = ">=3.10,<3.14" diff --git a/pysus/api/ducklake/models.py b/pysus/api/ducklake/models.py index 54d6850c..1cbfb4b6 100644 --- a/pysus/api/ducklake/models.py +++ b/pysus/api/ducklake/models.py @@ -116,7 +116,6 @@ class File(Catalog): __tablename__ = "files" id = Column(Integer, primary_key=True) - group_id = Column( Integer, ForeignKey("pysus.dataset_groups.id"), @@ -126,7 +125,6 @@ class File(Catalog): path = Column(String, nullable=False, unique=True) size = Column(Integer, nullable=False) rows = Column(Integer, nullable=False) - modified = Column(Date, nullable=False) group = relationship( diff --git a/pysus/api/ftp/client.py b/pysus/api/ftp/client.py index 88c46e1b..14eec50e 100644 --- a/pysus/api/ftp/client.py +++ b/pysus/api/ftp/client.py @@ -133,7 +133,7 @@ def __init__(self, path: str, name: str, info: FileInfo) -> None: def info(self) -> Dict[str, str]: """Returns a dictionary with human-readable file information""" return { - "size": humanize.naturalsize(self.__info["size"]), + "size": self.__info["size"], "type": f"{self.extension[1:].upper()} file", "modify": self.__info["modify"].strftime("%Y-%m-%d %I:%M%p"), } diff --git a/pysus/api/ftp/databases.py b/pysus/api/ftp/databases.py index c2dcf47c..e77cb709 100644 --- a/pysus/api/ftp/databases.py +++ b/pysus/api/ftp/databases.py @@ -14,6 +14,7 @@ from pysus.api.ftp import Database, Directory, File from pysus.utils import UFs, parse_UFs, 
to_list, zfill_year, MONTHS +from .models import FileDescription class CIHA(Database): @@ -46,30 +47,22 @@ class CIHA(Database): "CIHA": "Comunicação de Internação Hospitalar e Ambulatorial", } - def describe(self, file: File): - if not isinstance(file, File): - return file - - if file.extension.upper() in [".DBC", ".DBF"]: - group, _uf, year, month = self.format(file) - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf - - description = { - "name": str(file.basename), - "group": self.groups[group], - "uf": uf, - "month": MONTHS[int(month)], - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } - - return description - return file + def describe(self, file: File) -> Optional[FileDescription]: + if not isinstance(file, File) or file.extension.upper() not in [".DBC", ".DBF"]: + return None + + group, _uf, year, month = self.format(file) + uf = UFs.get(_uf, _uf) + + return FileDescription( + name=str(file.basename), + group=self.groups[group], + uf=uf, + month=MONTHS[int(month)], + year=zfill_year(year), + size=file.info["size"], + last_update=file.info["modify"], + ) def format(self, file: File) -> tuple: group, _uf = file.name[:4].upper(), file.name[4:6].upper() @@ -84,14 +77,16 @@ def get_files( group: Union[List[str], str] = "CIHA", ) -> List[File]: files = list( - filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) + filter(lambda f: f.extension.upper() + in [".DBC", ".DBF"], self.files) ) groups = [gr.upper() for gr in to_list(group)] if not all(gr in list(self.groups) for gr in groups): raise ValueError( - f"Unknown CIHA Group(s): {set(groups).difference(list(self.groups))}" + f"Unknown CIHA Group(s): {set( + groups).difference(list(self.groups))}" ) files = list(filter(lambda f: self.format(f)[0] in groups, files)) @@ -162,7 +157,8 @@ def load( if not all(group in self.groups for group in [gr.upper() for gr in groups]): raise ValueError( - f"Unknown CNES group(s): 
{set(groups).difference(self.groups)}" + f"Unknown CNES group(s): {set( + groups).difference(self.groups)}" ) for group in groups: @@ -174,34 +170,24 @@ def load( self.__loaded__.add(directory.name) return self - def describe(self, file: File) -> dict: - if not isinstance(file, File): - return {} + def describe(self, file: File) -> Optional[FileDescription]: + if not isinstance(file, File) or file.name == "GMufAAmm": + return None - if file.name == "GMufAAmm": - # Leftover - return {} + if file.extension.upper() not in [".DBC", ".DBF"]: + return None - if file.extension.upper() in [".DBC", ".DBF"]: - group, _uf, year, month = self.format(file) - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf - - description = { - "name": str(file.basename), - "group": self.groups[group], - "uf": uf, - "month": MONTHS[int(month)], - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } - - return description - return {} + group, _uf, year, month = self.format(file) + + return FileDescription( + name=str(file.basename), + group=self.groups.get(group, group), + uf=UFs.get(_uf, _uf), + month=MONTHS.get(int(month), month), + year=zfill_year(year), + size=file.info.get("size", 0), + last_update=file.info.get("modify"), + ) def format(self, file: File) -> tuple: group, _uf = file.name[:2].upper(), file.name[2:4].upper() @@ -260,26 +246,23 @@ class IBGEDATASUS(Database): ), } - def describe(self, file: File) -> dict: - if file.extension.upper() in [".ZIP"]: + def describe(self, file: File) -> Optional[FileDescription]: + ext = file.extension.upper() + + if ext == ".ZIP": year = file.name.split(".")[0][-2:] - description = { - "name": str(file.basename), - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } - return description - elif file.extension.upper() == ".DBF": + elif ext == ".DBF": year = file.name[-2:] - description = { - "name": str(file.basename), - "year": zfill_year(year), - "size": 
file.info["size"], - "last_update": file.info["modify"], - } - return description - return {} + else: + return None + + return FileDescription( + name=str(file.basename), + group="Population", + year=zfill_year(year), + size=file.info.get("size", 0), + last_update=file.info.get("modify"), + ) def format(self, file: File) -> tuple: return (file.name[-2:],) @@ -323,7 +306,7 @@ class PNI(Database): name = "PNI" paths = (Directory("/dissemin/publicos/PNI/DADOS"),) metadata = { - "long_name": ("Sistema de Informações do Programa Nacional de Imunizações"), + "long_name": ("Sistema de Informações do Programa Nacional de Imunizações"), # noqa "source": ( "https://datasus.saude.gov.br/acesso-a-informacao/morbidade-hospitalar-do-sus-sih-sus/", # noqa "https://datasus.saude.gov.br/acesso-a-informacao/producao-hospitalar-sih-sus/", # noqa @@ -347,26 +330,20 @@ class PNI(Database): "DPNI": "Doses Aplicadas", # TODO: may be incorrect } - def describe(self, file: File) -> dict: - if file.extension.upper() in [".DBC", ".DBF"]: - group, _uf, year = self.format(file) - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf + def describe(self, file: File) -> Optional[FileDescription]: + if not isinstance(file, File) or file.extension.upper() not in [".DBC", ".DBF"]: + return None - description = { - "name": file.basename, - "group": self.groups[group], - "uf": uf, - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } + group, _uf, year = self.format(file) - return description - return {} + return FileDescription( + name=str(file.basename), + group=self.groups.get(group, group), + uf=UFs.get(_uf, _uf), + year=zfill_year(year), + size=file.info.get("size", 0), + last_update=file.info.get("modify"), + ) def format(self, file: File) -> tuple: if len(file.name) != 8: @@ -383,14 +360,16 @@ def get_files( year: Optional[Union[list, str, int]] = None, ) -> List[File]: files = list( - filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], 
self.files) + filter(lambda f: f.extension.upper() + in [".DBC", ".DBF"], self.files) ) groups = [gr.upper() for gr in to_list(group)] if not all(gr in list(self.groups) for gr in groups): raise ValueError( - f"Unknown PNI Group(s): {set(groups).difference(list(self.groups))}" + f"Unknown PNI Group(s): {set( + groups).difference(list(self.groups))}" ) files = list(filter(lambda f: self.format(f)[0] in groups, files)) @@ -451,27 +430,21 @@ class SIA(Database): "SAD": "RAAS de Atenção Domiciliar", } - def describe(self, file: File) -> dict: - if file.extension.upper() == ".DBC": - group, _uf, year, month = self.format(file) - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf + def describe(self, file: File) -> Optional[FileDescription]: + if file.extension.upper() != ".DBC": + return None - description = { - "name": str(file.basename), - "group": self.groups[group], - "uf": uf, - "month": MONTHS[int(month)], - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } + group_code, _uf, year, month = self.format(file) - return description - return {} + return FileDescription( + name=str(file.basename), + group=self.groups.get(group_code, group_code), + uf=UFs.get(_uf, _uf), + month=MONTHS.get(int(month), str(month)), + year=zfill_year(year), + size=file.info.get("size", 0), + last_update=file.info.get("modify"), + ) def format(self, file: File) -> tuple: if file.extension.upper() in [".DBC", ".DBF"]: @@ -493,14 +466,16 @@ def get_files( month: Optional[Union[list, str, int]] = None, ) -> List[File]: files = list( - filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) + filter(lambda f: f.extension.upper() + in [".DBC", ".DBF"], self.files) ) groups = [gr.upper() for gr in to_list(group)] if not all(gr in list(self.groups) for gr in groups): raise ValueError( - f"Unknown SIA Group(s): {set(groups).difference(list(self.groups))}" + f"Unknown SIA Group(s): {set( + groups).difference(list(self.groups))}" ) files = 
list(filter(lambda f: self.format(f)[0] in groups, files)) @@ -555,27 +530,21 @@ class SIH(Database): "CM": "", # TODO } - def describe(self, file: File) -> dict: - if file.extension.upper() in [".DBC", ".DBF"]: - group, _uf, year, month = self.format(file) - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf - - description = { - "name": file.basename, - "group": self.groups[group], - "uf": uf, - "month": MONTHS[int(month)], - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } - - return description - return {} + def describe(self, file: File) -> Optional[FileDescription]: + if not isinstance(file, File) or file.extension.upper() not in [".DBC", ".DBF"]: + return None + + group_code, _uf, year, month = self.format(file) + + return FileDescription( + name=str(file.basename), + group=self.groups.get(group_code, group_code), + uf=UFs.get(_uf, _uf), + month=MONTHS.get(int(month), str(month)), + year=zfill_year(year), + size=file.info.get("size", 0), + last_update=file.info.get("modify"), + ) def format(self, file: File) -> tuple: group, _uf = file.name[:2].upper(), file.name[2:4].upper() @@ -590,14 +559,16 @@ def get_files( month: Optional[Union[list, str, int]] = None, ) -> List[File]: files = list( - filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) + filter(lambda f: f.extension.upper() + in [".DBC", ".DBF"], self.files) ) groups = [gr.upper() for gr in to_list(group)] if not all(gr in list(self.groups) for gr in groups): raise ValueError( - f"Unknown SIH Group(s): {set(groups).difference(list(self.groups))}" + f"Unknown SIH Group(s): {set( + groups).difference(list(self.groups))}" ) files = list(filter(lambda f: self.format(f)[0] in groups, files)) @@ -630,25 +601,18 @@ class SIM(Database): } groups = {"CID10": "DO", "CID9": "DOR"} - def describe(self, file: File) -> dict: + def describe(self, file: File) -> Optional[FileDescription]: group, _uf, year = self.format(file) - _groups = {v: k for k, v 
in self.groups.items()} - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf - - description = { - "name": str(file.basename), - "uf": uf, - "year": year, - "group": _groups[group], - "size": file.info["size"], - "last_update": file.info["modify"], - } - - return description + groups = {v: k for k, v in self.groups.items()} + + return FileDescription( + name=str(file.basename), + uf=UFs.get(_uf, _uf), + year=year, + group=groups.get(group, group), + size=file.info.get("size", 0), + last_update=file.info.get("modify"), + ) def format(self, file: File) -> tuple: if "CID9" in str(file.path): @@ -762,19 +726,20 @@ class SINAN(Database): "ZIKA": "Zika Vírus", } - def describe(self, file: File) -> dict: - if file.extension.upper() == ".DBC": - dis_code, year = self.format(file) + def describe(self, file: File) -> Optional[FileDescription]: + if not isinstance(file, File) or file.extension.upper() != ".DBC": + return None - description = { - "name": str(file.basename), - "disease": self.diseases[dis_code], - "year": zfill_year(year), - "size": file.info["size"], - "last_update": file.info["modify"], - } - return description - return {} + dis_code, year = self.format(file) + + return FileDescription( + name=str(file.basename), + disease=self.diseases.get(dis_code, "Unknown"), + group=dis_code, + year=zfill_year(year), + size=file.info.get("size", 0), + last_update=file.info.get("modify"), + ) def format(self, file: File) -> tuple: year = file.name[-2:] @@ -796,7 +761,8 @@ def get_files( year: Optional[Union[str, int, list]] = None, ) -> List[File]: files = list( - filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) + filter(lambda f: f.extension.upper() + in [".DBC", ".DBF"], self.files) ) if dis_code: @@ -804,7 +770,8 @@ def get_files( if codes and not all(code in self.diseases for code in codes): raise ValueError( - f"Unknown disease(s): {set(codes).difference(set(self.diseases))}" + f"Unknown disease(s): {set( + 
codes).difference(set(self.diseases))}" ) files = list(filter(lambda f: self.format(f)[0] in codes, files)) @@ -832,26 +799,20 @@ class SINASC(Database): "DNR": "Dados dos Nascidos Vivos por UF de residência", } - def describe(self, file: File) -> dict: - if file.extension.upper() == ".DBC": - group, _uf, year = self.format(file) - - try: - uf = UFs[_uf] - except KeyError: - uf = _uf - - description = { - "name": file.basename, - "group": self.groups[group], - "uf": uf, - "year": year, - "size": file.info["size"], - "last_update": file.info["modify"], - } - - return description - return {} + def describe(self, file: File) -> Optional[FileDescription]: + if not isinstance(file, File) or file.extension.upper() != ".DBC": + return None + + group_code, _uf, year = self.format(file) + + return FileDescription( + name=str(file.basename), + group=self.groups.get(group_code, group_code), + uf=UFs.get(_uf, _uf), + year=year, + size=file.info.get("size", 0), + last_update=file.info.get("modify"), + ) def format(self, file: File) -> tuple: if file.name == "DNEX2021": diff --git a/pysus/api/ftp/models.py b/pysus/api/ftp/models.py new file mode 100644 index 00000000..56632e34 --- /dev/null +++ b/pysus/api/ftp/models.py @@ -0,0 +1,29 @@ +import dateparser +from pydantic import BaseModel, ConfigDict, field_validator +from typing import Optional, Union +from datetime import datetime + + +class FileDescription(BaseModel): + model_config = ConfigDict(coerce_numbers_to_str=True) + + name: str + group: str + year: int + size: int + last_update: datetime + uf: Optional[str] = None + month: Optional[str] = None + disease: Optional[str] = None + + @field_validator("last_update", mode="before") + @classmethod + def parse_modify_date(cls, v: Union[str, datetime]) -> datetime: + if isinstance(v, datetime): + return v + + parsed = dateparser.parse(str(v)) + if parsed: + return parsed + + return datetime.now() diff --git a/pysus/management/__init__.py b/pysus/management/__init__.py new file 
mode 100644 index 00000000..e69de29b From 2b1b88845569f0b41bc2faba079138680e62221e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=A3=20Bida=20Vacaro?= Date: Thu, 19 Mar 2026 13:33:21 -0300 Subject: [PATCH 6/6] start building the dadosgov models to extract & describe the files within the datasets --- pysus/api/dadosgov/client.py | 6 +- pysus/api/dadosgov/databases.py | 90 ++++++ pysus/api/dadosgov/models.py | 92 +++++- pysus/api/ducklake/catalog.py | 24 ++ pysus/api/ducklake/storage.py | 15 + pysus/api/ftp/__init__.py | 1 + pysus/api/ftp/client.py | 519 ------------------------------ pysus/api/ftp/databases.py | 37 +-- pysus/api/ftp/models.py | 537 ++++++++++++++++++++++++++++++-- pysus/api/models.py | 29 ++ pysus/management/ingest.py | 116 +++++++ pysus/management/utils.py | 16 + 12 files changed, 901 insertions(+), 581 deletions(-) create mode 100644 pysus/api/dadosgov/databases.py create mode 100644 pysus/api/models.py create mode 100644 pysus/management/ingest.py create mode 100644 pysus/management/utils.py diff --git a/pysus/api/dadosgov/client.py b/pysus/api/dadosgov/client.py index 54b45691..77aae96c 100644 --- a/pysus/api/dadosgov/client.py +++ b/pysus/api/dadosgov/client.py @@ -2,7 +2,7 @@ from typing import List, Optional from pydantic import TypeAdapter from pysus.api.dadosgov.models import ( - DatasetDetail, + Dataset, DatasetSummary, ) from pysus import __version__ @@ -48,6 +48,6 @@ def list_datasets( adapter = TypeAdapter(List[DatasetSummary]) return adapter.validate_python(data) - def get_dataset(self, id: str) -> DatasetDetail: + def get_dataset(self, id: str) -> Dataset: data = self._get(f"/publico/conjuntos-dados/{id}") - return DatasetDetail.model_validate(data) + return Dataset.model_validate(data) diff --git a/pysus/api/dadosgov/databases.py b/pysus/api/dadosgov/databases.py new file mode 100644 index 00000000..8d9e3561 --- /dev/null +++ b/pysus/api/dadosgov/databases.py @@ -0,0 +1,90 @@ +__all__ = [ + "CNES", + "PNI", + "SIA", + "SINAN", +] + 
+from typing import List, Optional, Union + +from .models import Dataset, Resource +from pysus.utils import UFs, parse_UFs, to_list, zfill_year, MONTHS +from pysus.api.models import FileDescription + + +class CNES(Dataset): + name = "CNES" + ids = ( + "40a0d093-b12f-44a4-bdc7-bae8eb54dd04", + "9455b341-b06e-408e-8e10-54b32b3d74ec", + ) + + def describe(self, file: Resource) -> Optional[FileDescription]: ... + + def format(self, file: Resource) -> tuple: ... + + def get_files( + self, + year: Optional[Union[list, str, int]] = None, + month: Optional[Union[list, str, int]] = None, + ) -> List[Resource]: ... + + +class PNI(Dataset): + name = "PNI" + ids = ( + "2989d396-cb09-47e7-a3b8-a4b951ca0200", + "543aa08a-46c4-44e8-802e-198daa30753d", + "04292d08-ee4f-463a-b7b5-76cfb76775b3", + "7ed6eecc-c254-475c-92c5-daba5727596b", + "783b7456-6a6c-4025-a8bd-8e9caa0fb962", + "c6c3c6f3-2026-48a2-84ac-d8039714a0ba", + "9a25b796-80e3-444a-a4e7-405f5596d8ab", + ) + + def describe(self, file: Resource) -> Optional[FileDescription]: ... + + def format(self, file: Resource) -> tuple: ... + + def get_files( + self, + year: Optional[Union[list, str, int]] = None, + month: Optional[Union[list, str, int]] = None, + ) -> List[Resource]: ... + + +class SIA(Dataset): + name = "SIA" + ids = ("9a335cb7-2b4f-4fce-8947-e8441b4a90af",) + + def describe(self, file: Resource) -> Optional[FileDescription]: ... + + def format(self, file: Resource) -> tuple: ... + + def get_files( + self, + group: Union[List[str], str], + uf: Optional[Union[List[str], str]] = None, + year: Optional[Union[list, str, int]] = None, + month: Optional[Union[list, str, int]] = None, + ) -> List[Resource]: ... + + +class SINAN(Dataset): + name = "SINAN" + ids = ( + "4d5e5d44-58a8-4d67-b8aa-4ef1e4b00a1c", + "5699abe0-0510-4da8-b47d-209b3bb32b34", + "4557ba96-7d52-4a56-bd6f-f99a5af09f77", + "740ce8f4-7a5d-4351-aad4-7623f2490ada", + ) + + def describe(self, file: Resource) -> Optional[FileDescription]: ... 
+ + def format(self, file: Resource) -> tuple: ... + + def get_files( + self, + dis_code: Optional[Union[str, list]] = None, + year: Optional[Union[str, int, list]] = None, + ) -> List[Resource]: ... diff --git a/pysus/api/dadosgov/models.py b/pysus/api/dadosgov/models.py index 407e3560..5d388b1d 100644 --- a/pysus/api/dadosgov/models.py +++ b/pysus/api/dadosgov/models.py @@ -1,10 +1,15 @@ +import zipfile import requests +import urllib3 from pathlib import Path from datetime import datetime as dt from typing import Optional, List, Any, Annotated, Union -from pydantic import BaseModel, Field, BeforeValidator +from pydantic import BaseModel, Field, BeforeValidator, field_validator from pysus import CACHEPATH +from pysus.api.models import FileDescription + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) def to_datetime(value: Any) -> Optional[dt]: @@ -41,12 +46,12 @@ def __str__(self): class Resource(BaseModel): id: str title: str = Field(alias="titulo") - description: str = Field(alias="descricao") + description: Optional[str] = Field(None, alias="descricao") url: str = Field(alias="link") format: str = Field(alias="formato") - size: int = Field(alias="tamanho") + api_size: int = Field(alias="tamanho") cataloging_date: Optional[str] = Field(None, alias="dataCatalogacao") - last_modified: Optional[str] = Field( + last_modified: Optional[str | dt] = Field( None, alias="dataUltimaAtualizacaoArquivo", ) @@ -59,30 +64,78 @@ class Resource(BaseModel): def __str__(self): return self.file_name + @field_validator("last_modified", mode="before") + @classmethod + def parse_date(cls, v: Optional[str]) -> Optional[dt]: + if not v or isinstance(v, dt): + return v + try: + return dt.strptime(v, "%d/%m/%Y") + except ValueError: + return None + + @property + def basename(self) -> str: + name = self.url.split("/")[-1] + return name.rstrip(".zip").replace("_csv", ".csv") + + @property + def size(self) -> int: + try: + response = requests.head( + self.url, + 
verify=False, + allow_redirects=True, + timeout=5, + ) + return int(response.headers.get("Content-Length", 0)) + except (requests.RequestException, ValueError): + return self.api_size + def download(self, target_dir: Union[str, Path] = CACHEPATH) -> Path: target_path = Path(target_dir) target_path.mkdir(parents=True, exist_ok=True) - output_file = target_path / ( - self.file_name or f"{self.id}.{self.format.lower()}" - ) + tmp_file = target_path / f"{self.id}.download" - response = requests.get(self.url, stream=True) + response = requests.get(self.url, stream=True, verify=False) response.raise_for_status() - with open(output_file, "wb") as f: + with open(tmp_file, "wb") as f: for chunk in response.iter_content(chunk_size=8192): - f.write(chunk) + if chunk: + f.write(chunk) + + if zipfile.is_zipfile(tmp_file): + with zipfile.ZipFile(tmp_file) as z: + members = z.namelist() + + if len(members) == 1: + name = members[0] + output_file = target_path / name + z.extract(name, target_path) + else: + z.extractall(target_path) + output_file = target_path + + tmp_file.unlink() + return output_file + + output_file = target_path / ( + self.file_name or f"{self.id}.{self.format.lower()}" + ) + + tmp_file.rename(output_file) return output_file -class DatasetDetail(BaseModel): +class Dataset(BaseModel): id: str title: str = Field(alias="titulo") slug: str = Field(alias="nome") organization: str = Field(alias="organizacao") - description: str = Field(alias="descricao") + description: Optional[str] = Field(None, alias="descricao") license: Optional[str] = Field(None, alias="licenca") maintainer: Optional[str] = Field(None, alias="responsavel") maintainer_email: Optional[str] = Field(None, alias="emailResponsavel") @@ -93,8 +146,7 @@ class DatasetDetail(BaseModel): is_open_data: Bool = Field(alias="dadosAbertos") is_discontinued: Bool = Field(alias="descontinuado") is_private: Bool = Field(False, alias="privado") - metadata_updated: DateTime = Field( - None, 
alias="dataUltimaAtualizacaoMetadados") + metadata_updated: DateTime = Field(None, alias="dataUltimaAtualizacaoMetadados") file_updated: DateTime = Field(None, alias="dataUltimaAtualizacaoArquivo") cataloging_date: DateTime = Field(None, alias="dataCatalogacao") visibility: str = Field(alias="visibilidade") @@ -105,6 +157,18 @@ class DatasetDetail(BaseModel): def __str__(self): return self.id + def describe(self, resource: Resource) -> FileDescription: + return FileDescription( + name=resource.basename, + group=self.slug, + year=int, + size=resource.size, + last_update=resource.last_modified or self.file_updated or dt.now(), + uf=None, + month=None, + disease=self.title, + ) + class DatasetSummary(BaseModel): id: str diff --git a/pysus/api/ducklake/catalog.py b/pysus/api/ducklake/catalog.py index e69de29b..738de713 100644 --- a/pysus/api/ducklake/catalog.py +++ b/pysus/api/ducklake/catalog.py @@ -0,0 +1,24 @@ +from typing import List + + +class CatalogBrowser: + def __init__(self, client): + self.client = client + + def list_datasets(self) -> List[str]: + res = self.client.con.execute("SELECT name FROM datasets").fetchall() + return [r[0] for r in res] + + def get_groups(self, dataset_name: str): + query = f""" + SELECT g.name, g.id + FROM dataset_groups g + JOIN datasets d ON g.dataset_id = d.id + WHERE d.name = '{dataset_name}' + """ + return self.client.con.execute(query).df() + + def get_files(self, group_id: int): + return self.client.con.execute( + f"SELECT * FROM files WHERE group_id = {group_id}" + ).df() diff --git a/pysus/api/ducklake/storage.py b/pysus/api/ducklake/storage.py index e69de29b..caf36c1e 100644 --- a/pysus/api/ducklake/storage.py +++ b/pysus/api/ducklake/storage.py @@ -0,0 +1,15 @@ +import duckdb + + +class StorageManager: + def __init__(self, connection: duckdb.DuckDBPyConnection): + self.con = connection + + def query(self, sql: str): + return self.con.execute(sql).df() + + def get_file_url(self, path: str) -> str: + return 
f"s3://pysus/public/{path}" + + def list_tables(self): + return self.con.execute("SHOW TABLES").df() diff --git a/pysus/api/ftp/__init__.py b/pysus/api/ftp/__init__.py index 852efe38..af4485c2 100644 --- a/pysus/api/ftp/__init__.py +++ b/pysus/api/ftp/__init__.py @@ -1,5 +1,6 @@ from .client import * # noqa from .databases import * # noqa +from .models import * # noqa AVAILABLE_DATABASES = [ diff --git a/pysus/api/ftp/client.py b/pysus/api/ftp/client.py index 14eec50e..f74598e0 100644 --- a/pysus/api/ftp/client.py +++ b/pysus/api/ftp/client.py @@ -1,56 +1,22 @@ from __future__ import annotations -__all__ = ["File", "Directory", "Database"] - -import asyncio -import os import pathlib -from datetime import datetime from ftplib import FTP from typing import ( - Any, - Dict, Final, - List, Optional, Protocol, - Tuple, - TypedDict, - Union, runtime_checkable, ) -import humanize -from aioftp import Client -from loguru import logger -from tqdm import tqdm -from typing_extensions import Self from pysus import CACHEPATH from pysus.data.local import Data -from pysus.utils import to_list - -# Type aliases -PathLike = Union[str, pathlib.Path] -FileContent = Dict[str, Union["Directory", "File"]] -# Constants __cachepath__: Final[pathlib.Path] = pathlib.Path(CACHEPATH) __cachepath__.mkdir(exist_ok=True) -# Cache storage -DIRECTORY_CACHE: Dict[str, "Directory"] = {} - - -class FileInfo(TypedDict): - """File information dictionary type""" - - size: Union[int, str] - type: str - modify: datetime - - @runtime_checkable class Downloadable(Protocol): async def download(self, local_dir: str) -> Data: @@ -77,488 +43,3 @@ def close(cls) -> None: if cls._instance and cls._instance.sock: cls._instance.close() cls._instance = None - - -class File: - """ - FTP File representation with improved type safety. - - This class provides methods for interacting with files on the DataSUS FTP - server. 
It includes functionality for downloading files synchronously and - asynchronously, as well as retrieving file information in a human-readable - format. - - Attributes: - name (str): The name of the file without the extension. - extension (str): The file extension. - basename (str): The full name of the file including the extension. - path (str): The full path to the file on the FTP server. - parent_path (str): The directory path where the file is located on the - FTP server. - __info (FileInfo): Metadata about the file, including size, type, and - modification date. - - Methods: - info() -> Dict[str, str]: - Returns a dictionary with human-readable file information, - including size, type, and modification date. - - download( - local_dir: str = CACHEPATH, _pbar: Optional[tqdm] = None - ) -> Data: - Downloads the file to the specified local directory. If a progress - bar (_pbar) is provided, it updates the progress bar during the - download. - - async_download(local_dir: str = CACHEPATH) -> Data: - Asynchronously downloads the file to the specified local directory. - - _line_parser(file_line: bytes) -> Tuple[str, Dict[str, Any]]: - Static method to parse a line from the FTP LIST command and - extract file information. 
- """ - - def __init__(self, path: str, name: str, info: FileInfo) -> None: - self.name, self.extension = os.path.splitext(name) - self.basename: str = f"{self.name}{self.extension}" - self.path: str = ( - f"{path}/{self.basename}" - if not path.endswith("/") - else f"{path}{self.basename}" - ) - self.parent_path: str = os.path.dirname(self.path) - self.__info: FileInfo = info - - @property - def info(self) -> Dict[str, str]: - """Returns a dictionary with human-readable file information""" - return { - "size": self.__info["size"], - "type": f"{self.extension[1:].upper()} file", - "modify": self.__info["modify"].strftime("%Y-%m-%d %I:%M%p"), - } - - def download( - self, local_dir: str = CACHEPATH, _pbar: Optional[tqdm] = None - ) -> Data: - """Downloads the file to the specified local directory""" - target_dir = pathlib.Path(local_dir) - target_dir.mkdir(exist_ok=True, parents=True) - - filepath = target_dir / self.basename - filesize = int(self.__info["size"]) - - # Check for existing files - for ext in (".parquet", ".dbf", ""): - existing = filepath.with_suffix(ext) - if existing.exists(): - if _pbar: - _pbar.update(filesize - _pbar.n) - return Data(str(existing), _pbar=_pbar) # type: ignore - - if _pbar: - _pbar.unit = "B" - _pbar.unit_scale = True - _pbar.reset(total=filesize) - _pbar.set_description(self.basename) - - try: - ftp = FTPSingleton.get_instance() - with open(filepath, "wb") as output: - - def callback(data: bytes) -> None: - output.write(data) - if _pbar: - _pbar.update(len(data)) - - ftp.retrbinary(f"RETR {self.path}", callback) - - except Exception as exc: - if filepath.exists(): - filepath.unlink() - raise exc - finally: - FTPSingleton.close() - - if _pbar: - _pbar.update(filesize - _pbar.n) - return Data(str(filepath), _pbar=_pbar) # type: ignore - - async def async_download(self, local_dir: str = CACHEPATH) -> Data: - """ - Asynchronously downloads the file to the specified local directory - """ - target_dir = pathlib.Path(local_dir) - 
target_dir.mkdir(exist_ok=True, parents=True) - filepath = target_dir / self.basename - - # Check existing files - for ext in (".parquet", ".dbf", ""): - existing = filepath.with_suffix(ext) - if existing.exists(): - return Data(str(existing)) # type: ignore - - async with Client.context( - host="ftp.datasus.gov.br", parse_list_line_custom=self._line_parser - ) as client: - await client.login() - await client.download(self.path, str(filepath), write_into=True) - - return Data(str(filepath)) # type: ignore - - @staticmethod - def _line_parser(file_line: bytes) -> Tuple[str, Dict[str, Any]]: - """Static method to parse a line from the FTP LIST command and extract - file information - """ - line = file_line.decode("utf-8") - if "" in line: - date, time, _, *name = line.strip().split() - info = {"size": 0, "type": "dir"} - name = " ".join(name) - else: - date, time, size, name = line.strip().split() - info = {"size": size, "type": "file"} - - modify = datetime.strptime(f"{date} {time}", "%m-%d-%y %I:%M%p") - info["modify"] = modify.strftime("%m/%d/%Y %I:%M%p") - return name, info - - def __str__(self) -> str: - return str(self.basename) - - def __repr__(self) -> str: - return str(self.basename) - - def __hash__(self): - return hash(self.path) - - def __eq__(self, other): - if isinstance(other, File): - return self.path == other.path - return False - - -class Directory: - """ - Directory class with caching and lazy loading. - - The Directory class represents a directory in a file system and includes - mechanisms for caching instances and lazy loading of directory content. - When a Directory instance is created, it normalizes the provided path - and caches the instance. The content of the directory is not loaded - immediately; instead, it is loaded when the `content` property or the - `load` method is accessed or called. - - Attributes: - path (str): The normalized path of the directory. - name (str): The name of the directory. 
- parent (Directory): The parent directory instance. - loaded (bool): Indicates whether the directory content has been loaded. - __content__ (Dict[str, Union[File, Directory]]): A dictionary - containing the directory's content, with names as keys and File or - Directory instances as values. - - Methods: - _normalize_path(path: str) -> str: Normalizes the given path. - _get_root_directory() -> Directory: Returns the root directory - instance, creating it if necessary. - _init_root_child(name: str) -> None: Initializes a root child - directory. - _init_regular(parent_path: str, name: str) -> None: Initializes a - regular directory. - content() -> List[Union[Directory, File]]: Returns the content of the - directory, loading it if necessary. - load() -> Self: Loads the content of the directory and marks it as - loaded. - """ - - name: str - path: str - parent: "Directory" - loaded: bool - __content__: Dict[str, Union[File, "Directory"]] - - def __new__(cls, path: str, _is_root_child: bool = False) -> "Directory": - normalized_path = os.path.normpath(path) - - # Handle root directory case - if normalized_path == "/": - return cls._get_root_directory() - - # Return cached instance if exists - if normalized_path in DIRECTORY_CACHE: - return DIRECTORY_CACHE[normalized_path] - - # Use os.path.split for reliable path splitting - parent_path, name = os.path.split(normalized_path) - - # Handle empty parent path - if not parent_path: - parent_path = "/" - # Handle parent paths that don't start with / - elif not parent_path.startswith("/"): - parent_path = "/" + parent_path - - # Create new instance - instance = super().__new__(cls) - instance.path = normalized_path - - if _is_root_child: - instance._init_root_child(name) - else: - instance._init_regular(parent_path, name) - - DIRECTORY_CACHE[normalized_path] = instance - return instance - - @staticmethod - def _normalize_path(path: str) -> str: - """Normalizes the given path""" - path = f"/{path}" if not path.startswith("/") 
else path - return path.removesuffix("/") - - @classmethod - def _get_root_directory(cls) -> Directory: - """Returns the root directory instance, creating it if necessary""" - if "/" not in DIRECTORY_CACHE: - root = super().__new__(cls) - root.parent = root - root.name = "/" - root.path = "/" - root.loaded = False - root.__content__ = {} - DIRECTORY_CACHE["/"] = root - return DIRECTORY_CACHE["/"] - - def _init_root_child(self, name: str) -> None: - """Initializes a root child directory""" - self.parent = DIRECTORY_CACHE["/"] - self.name = name - self.loaded = False - self.__content__ = {} - - def _init_regular(self, parent_path: str, name: str) -> None: - """Initializes a regular directory""" - self.parent = Directory(parent_path) - self.name = name - self.loaded = False - self.__content__ = {} - - @property - def content(self) -> List[Union[Directory, File]]: - """Returns the content of the directory, loading it if necessary""" - if not self.loaded: - self.load() - return list(self.__content__.values()) - - def load(self) -> Self: - """Loads the content of the directory and marks it as loaded""" - self.__content__ |= load_directory_content(self.path) - self.loaded = True - return self - - def reload(self): - """ - Reloads the content of the Directory - """ - self.loaded = False - return self.load() - - def __str__(self) -> str: - return self.path - - def __repr__(self) -> str: - return self.path - - def __hash__(self): - return hash(self.path) - - def __eq__(self, other): - if isinstance(other, Directory): - return self.path == other.path - return False - - -def load_directory_content(path: str) -> FileContent: - """Directory content loading""" - content: FileContent = {} - - try: - ftp = FTPSingleton.get_instance() - ftp.cwd(path) - path = path.removesuffix("/") - - def line_parser(line: str): - if "" in line: - date, time, _, name = line.strip().split(maxsplit=3) - modify = datetime.strptime(f"{date} {time}", "%m-%d-%y %I:%M%p") - info = {"size": 0, "type": 
"dir", "modify": modify} - xpath = f"{path}/{name}" - content[name] = Directory(xpath) - else: - date, time, size, name = line.strip().split(maxsplit=3) - modify = datetime.strptime(f"{date} {time}", "%m-%d-%y %I:%M%p") - info: FileInfo = { - "size": size, - "type": "file", - "modify": modify, - } - content[name] = File(path, name, info) - - ftp.retrlines("LIST", line_parser) - except Exception as exc: - raise exc - finally: - FTPSingleton.close() - - to_remove = [ - name - for name in content - if name.upper().endswith(".DBF") - and name.upper().replace(".DBF", ".DBC") in content - ] - - for name in to_remove: - del content[name] - - return content - - -class Database: - """ - Base class for PySUS databases. Contains common functions - for accessing DataSUS FTP server. With this class, it is - possible to construct database classes for different DataSUS - files, sharing state and functionalities. - - Parameters - ftp [FTP]: ftplib.FTP object for connecting in DataSUS server. - name [str]: database name - paths [list[Directory]]: server paths where the files are located - files [list[Files]]: list of parsed Files from Database content - metadata [dict]: dict containing database's metadata information - - Methods - load(): Loads the database paths content to its own content - describe(file): describes a file according to each database's - spec. Returns a dict with file information - format(file): extracts from file name database related info, such as - year, month, UF and/or other useful info for the DB - get_files(Any): filters files using database related format, depending - on the database's files specs - """ - - ftp: FTP - name: str - paths: Tuple[Directory, ...] 
- metadata: dict - __content__: Dict[str, Union[Directory, File]] - - def __init__(self) -> None: - self.ftp = FTP("ftp.datasus.gov.br") - self.__content__ = {} - - def __repr__(self) -> str: - return f"{self.name} - {self.metadata['long_name']}" - - @property - def content(self) -> List[Union[Directory, File]]: - """ - Lists Database content. The `paths` will be loaded if this property is - called or if explicitly using `load()`. To add specific Directory - inside content, `load()` the directory and call `content` again. - """ - if not self.__content__: - logger.info( - "content is not loaded, use `load()` to load default paths") - return [] - return sorted(list(self.__content__.values()), key=str) - - @property - def files(self) -> List[File]: - """ - Lists Files inside content. To load a specific Directory inside - content, just `load()` this directory and list files again. - """ - return [f for f in self.content if isinstance(f, File)] - - def load( - self, - directories: Optional[ - Union[Directory, List[Directory], Tuple[Directory, ...]] - ] = None, - ) -> Database: - """ - Loads specific directories to Database content. Will aggregate the - files found within Directories into Database.content. - """ - if not directories: - directories = list(self.paths) - - directories_list = to_list(directories) - - for directory in directories_list: - if not isinstance(directory, Directory): - raise ValueError("Invalid directory provided.") - - directory.load() - self.__content__.update(directory.__content__) - return self - - def describe(self, file: File) -> dict: - """ - Receives a `File` and returns a dict with its information, - according to the database's specifications. This method is - helpful to return the FTP's file in a humanized format - - Parameters - file [File]: a `File` instance - """ - ... - - def format(self, file: File) -> tuple: - """ - Formats a File based on the database specifications, - extracting its name's parameters given a pattern. 
- - Parameters - file [File]: a `File` instance - """ - ... - - def get_files(self, *args, **kwargs) -> list[File]: - """ - Filters the list of `File`s according to each database file - pattern, as UFs, Groups, Years, Months, etc. This method will - also be responsible to look for wrong values within the file - pattern and possible extra characters in its basename - """ - ... - - def download(self, files: List[File], local_dir: str = CACHEPATH) -> List[str]: - """ - Downloads a list of Files. - """ - files = to_list(files) - pbar = tqdm(total=len(files), dynamic_ncols=True) - dfiles = [] - for file in files: - if isinstance(file, File): - dfiles.append(file.download(local_dir=local_dir, _pbar=pbar)) - pbar.close() - if len(dfiles) == 1: - return dfiles[0] - return dfiles - - async def async_download(self, files: List[File], local_dir: str = CACHEPATH): - """ - Asynchronously downloads a list of files - """ - - async def download_file(file): - if isinstance(file, File): - await file.async_download(local_dir=local_dir) - - tasks = [download_file(file) for file in files] - await asyncio.gather(*tasks) diff --git a/pysus/api/ftp/databases.py b/pysus/api/ftp/databases.py index e77cb709..92c7e387 100644 --- a/pysus/api/ftp/databases.py +++ b/pysus/api/ftp/databases.py @@ -12,9 +12,9 @@ from typing import List, Optional, Union, Literal -from pysus.api.ftp import Database, Directory, File +from pysus.api.ftp.models import Database, Directory, File from pysus.utils import UFs, parse_UFs, to_list, zfill_year, MONTHS -from .models import FileDescription +from pysus.api.models import FileDescription class CIHA(Database): @@ -77,16 +77,14 @@ def get_files( group: Union[List[str], str] = "CIHA", ) -> List[File]: files = list( - filter(lambda f: f.extension.upper() - in [".DBC", ".DBF"], self.files) + filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) ) groups = [gr.upper() for gr in to_list(group)] if not all(gr in list(self.groups) for gr in groups): raise 
ValueError( - f"Unknown CIHA Group(s): {set( - groups).difference(list(self.groups))}" + f"Unknown CIHA Group(s): {set(groups).difference(list(self.groups))}" ) files = list(filter(lambda f: self.format(f)[0] in groups, files)) @@ -157,8 +155,7 @@ def load( if not all(group in self.groups for group in [gr.upper() for gr in groups]): raise ValueError( - f"Unknown CNES group(s): {set( - groups).difference(self.groups)}" + f"Unknown CNES group(s): {set(groups).difference(self.groups)}" ) for group in groups: @@ -360,16 +357,14 @@ def get_files( year: Optional[Union[list, str, int]] = None, ) -> List[File]: files = list( - filter(lambda f: f.extension.upper() - in [".DBC", ".DBF"], self.files) + filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) ) groups = [gr.upper() for gr in to_list(group)] if not all(gr in list(self.groups) for gr in groups): raise ValueError( - f"Unknown PNI Group(s): {set( - groups).difference(list(self.groups))}" + f"Unknown PNI Group(s): {set(groups).difference(list(self.groups))}" ) files = list(filter(lambda f: self.format(f)[0] in groups, files)) @@ -466,16 +461,14 @@ def get_files( month: Optional[Union[list, str, int]] = None, ) -> List[File]: files = list( - filter(lambda f: f.extension.upper() - in [".DBC", ".DBF"], self.files) + filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) ) groups = [gr.upper() for gr in to_list(group)] if not all(gr in list(self.groups) for gr in groups): raise ValueError( - f"Unknown SIA Group(s): {set( - groups).difference(list(self.groups))}" + f"Unknown SIA Group(s): {set(groups).difference(list(self.groups))}" ) files = list(filter(lambda f: self.format(f)[0] in groups, files)) @@ -559,16 +552,14 @@ def get_files( month: Optional[Union[list, str, int]] = None, ) -> List[File]: files = list( - filter(lambda f: f.extension.upper() - in [".DBC", ".DBF"], self.files) + filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) ) groups = [gr.upper() for gr in 
to_list(group)] if not all(gr in list(self.groups) for gr in groups): raise ValueError( - f"Unknown SIH Group(s): {set( - groups).difference(list(self.groups))}" + f"Unknown SIH Group(s): {set(groups).difference(list(self.groups))}" ) files = list(filter(lambda f: self.format(f)[0] in groups, files)) @@ -761,8 +752,7 @@ def get_files( year: Optional[Union[str, int, list]] = None, ) -> List[File]: files = list( - filter(lambda f: f.extension.upper() - in [".DBC", ".DBF"], self.files) + filter(lambda f: f.extension.upper() in [".DBC", ".DBF"], self.files) ) if dis_code: @@ -770,8 +760,7 @@ def get_files( if codes and not all(code in self.diseases for code in codes): raise ValueError( - f"Unknown disease(s): {set( - codes).difference(set(self.diseases))}" + f"Unknown disease(s): {set(codes).difference(set(self.diseases))}" ) files = list(filter(lambda f: self.format(f)[0] in codes, files)) diff --git a/pysus/api/ftp/models.py b/pysus/api/ftp/models.py index 56632e34..d6a0bb0e 100644 --- a/pysus/api/ftp/models.py +++ b/pysus/api/ftp/models.py @@ -1,29 +1,524 @@ -import dateparser -from pydantic import BaseModel, ConfigDict, field_validator -from typing import Optional, Union +from __future__ import annotations + +__all__ = ["File", "Directory", "Database"] + +import asyncio +import os +import pathlib from datetime import datetime +from ftplib import FTP +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, + Union, + TypedDict, +) + +from aioftp import Client +from loguru import logger +from tqdm import tqdm +from typing_extensions import Self + +from pysus import CACHEPATH +from pysus.data.local import Data +from pysus.utils import to_list +from .client import FTPSingleton + + +DIRECTORY_CACHE: Dict[str, "Directory"] = {} +FileContent = Dict[str, Union["Directory", "File"]] + + +class FileInfo(TypedDict): + """File information dictionary type""" + + size: Union[int, str] + type: str + modify: datetime + + +class File: + """ + FTP File representation with 
improved type safety. + + This class provides methods for interacting with files on the DataSUS FTP + server. It includes functionality for downloading files synchronously and + asynchronously, as well as retrieving file information in a human-readable + format. + + Attributes: + name (str): The name of the file without the extension. + extension (str): The file extension. + basename (str): The full name of the file including the extension. + path (str): The full path to the file on the FTP server. + parent_path (str): The directory path where the file is located on the + FTP server. + __info (FileInfo): Metadata about the file, including size, type, and + modification date. + + Methods: + info() -> Dict[str, str]: + Returns a dictionary with human-readable file information, + including size, type, and modification date. + + download( + local_dir: str = CACHEPATH, _pbar: Optional[tqdm] = None + ) -> Data: + Downloads the file to the specified local directory. If a progress + bar (_pbar) is provided, it updates the progress bar during the + download. + + async_download(local_dir: str = CACHEPATH) -> Data: + Asynchronously downloads the file to the specified local directory. + + _line_parser(file_line: bytes) -> Tuple[str, Dict[str, Any]]: + Static method to parse a line from the FTP LIST command and + extract file information. 
+ """ + + def __init__(self, path: str, name: str, info: FileInfo) -> None: + self.name, self.extension = os.path.splitext(name) + self.basename: str = f"{self.name}{self.extension}" + self.path: str = ( + f"{path}/{self.basename}" + if not path.endswith("/") + else f"{path}{self.basename}" + ) + self.parent_path: str = os.path.dirname(self.path) + self.__info: FileInfo = info + + @property + def info(self) -> Dict[str, str]: + """Returns a dictionary with human-readable file information""" + return { + "size": self.__info["size"], + "type": f"{self.extension[1:].upper()}", + "modify": self.__info["modify"].strftime("%Y-%m-%d %I:%M%p"), + } + + def download( + self, local_dir: str = CACHEPATH, _pbar: Optional[tqdm] = None + ) -> Data: + """Downloads the file to the specified local directory""" + target_dir = pathlib.Path(local_dir) + target_dir.mkdir(exist_ok=True, parents=True) + + filepath = target_dir / self.basename + filesize = int(self.__info["size"]) + + # Check for existing files + for ext in (".parquet", ".dbf", ""): + existing = filepath.with_suffix(ext) + if existing.exists(): + if _pbar: + _pbar.update(filesize - _pbar.n) + return Data(str(existing), _pbar=_pbar) # type: ignore + + if _pbar: + _pbar.unit = "B" + _pbar.unit_scale = True + _pbar.reset(total=filesize) + _pbar.set_description(self.basename) + try: + ftp = FTPSingleton.get_instance() + with open(filepath, "wb") as output: -class FileDescription(BaseModel): - model_config = ConfigDict(coerce_numbers_to_str=True) + def callback(data: bytes) -> None: + output.write(data) + if _pbar: + _pbar.update(len(data)) + + ftp.retrbinary(f"RETR {self.path}", callback) + + except Exception as exc: + if filepath.exists(): + filepath.unlink() + raise exc + finally: + FTPSingleton.close() + + if _pbar: + _pbar.update(filesize - _pbar.n) + return Data(str(filepath), _pbar=_pbar) # type: ignore + + async def async_download(self, local_dir: str = CACHEPATH) -> Data: + """ + Asynchronously downloads the file to 
the specified local directory + """ + target_dir = pathlib.Path(local_dir) + target_dir.mkdir(exist_ok=True, parents=True) + filepath = target_dir / self.basename + + # Check existing files + for ext in (".parquet", ".dbf", ""): + existing = filepath.with_suffix(ext) + if existing.exists(): + return Data(str(existing)) # type: ignore + + async with Client.context( + host="ftp.datasus.gov.br", parse_list_line_custom=self._line_parser + ) as client: + await client.login() + await client.download(self.path, str(filepath), write_into=True) + + return Data(str(filepath)) # type: ignore + + @staticmethod + def _line_parser(file_line: bytes) -> Tuple[str, Dict[str, Any]]: + """Static method to parse a line from the FTP LIST command and extract + file information + """ + line = file_line.decode("utf-8") + if "" in line: + date, time, _, *name = line.strip().split() + info = {"size": 0, "type": "dir"} + name = " ".join(name) + else: + date, time, size, name = line.strip().split() + info = {"size": size, "type": "file"} + + modify = datetime.strptime(f"{date} {time}", "%m-%d-%y %I:%M%p") + info["modify"] = modify.strftime("%m/%d/%Y %I:%M%p") + return name, info + + def __str__(self) -> str: + return str(self.basename) + + def __repr__(self) -> str: + return str(self.basename) + + def __hash__(self): + return hash(self.path) + + def __eq__(self, other): + if isinstance(other, File): + return self.path == other.path + return False + + +class Directory: + """ + Directory class with caching and lazy loading. + + The Directory class represents a directory in a file system and includes + mechanisms for caching instances and lazy loading of directory content. + When a Directory instance is created, it normalizes the provided path + and caches the instance. The content of the directory is not loaded + immediately; instead, it is loaded when the `content` property or the + `load` method is accessed or called. + + Attributes: + path (str): The normalized path of the directory. 
+ name (str): The name of the directory. + parent (Directory): The parent directory instance. + loaded (bool): Indicates whether the directory content has been loaded. + __content__ (Dict[str, Union[File, Directory]]): A dictionary + containing the directory's content, with names as keys and File or + Directory instances as values. + + Methods: + _normalize_path(path: str) -> str: Normalizes the given path. + _get_root_directory() -> Directory: Returns the root directory + instance, creating it if necessary. + _init_root_child(name: str) -> None: Initializes a root child + directory. + _init_regular(parent_path: str, name: str) -> None: Initializes a + regular directory. + content() -> List[Union[Directory, File]]: Returns the content of the + directory, loading it if necessary. + load() -> Self: Loads the content of the directory and marks it as + loaded. + """ name: str - group: str - year: int - size: int - last_update: datetime - uf: Optional[str] = None - month: Optional[str] = None - disease: Optional[str] = None - - @field_validator("last_update", mode="before") + path: str + parent: "Directory" + loaded: bool + __content__: Dict[str, Union[File, "Directory"]] + + def __new__(cls, path: str, _is_root_child: bool = False) -> "Directory": + normalized_path = os.path.normpath(path) + + # Handle root directory case + if normalized_path == "/": + return cls._get_root_directory() + + # Return cached instance if exists + if normalized_path in DIRECTORY_CACHE: + return DIRECTORY_CACHE[normalized_path] + + # Use os.path.split for reliable path splitting + parent_path, name = os.path.split(normalized_path) + + # Handle empty parent path + if not parent_path: + parent_path = "/" + # Handle parent paths that don't start with / + elif not parent_path.startswith("/"): + parent_path = "/" + parent_path + + # Create new instance + instance = super().__new__(cls) + instance.path = normalized_path + + if _is_root_child: + instance._init_root_child(name) + else: + 
instance._init_regular(parent_path, name) + + DIRECTORY_CACHE[normalized_path] = instance + return instance + + @staticmethod + def _normalize_path(path: str) -> str: + """Normalizes the given path""" + path = f"/{path}" if not path.startswith("/") else path + return path.removesuffix("/") + @classmethod - def parse_modify_date(cls, v: Union[str, datetime]) -> datetime: - if isinstance(v, datetime): - return v + def _get_root_directory(cls) -> Directory: + """Returns the root directory instance, creating it if necessary""" + if "/" not in DIRECTORY_CACHE: + root = super().__new__(cls) + root.parent = root + root.name = "/" + root.path = "/" + root.loaded = False + root.__content__ = {} + DIRECTORY_CACHE["/"] = root + return DIRECTORY_CACHE["/"] + + def _init_root_child(self, name: str) -> None: + """Initializes a root child directory""" + self.parent = DIRECTORY_CACHE["/"] + self.name = name + self.loaded = False + self.__content__ = {} + + def _init_regular(self, parent_path: str, name: str) -> None: + """Initializes a regular directory""" + self.parent = Directory(parent_path) + self.name = name + self.loaded = False + self.__content__ = {} + + @property + def content(self) -> List[Union[Directory, File]]: + """Returns the content of the directory, loading it if necessary""" + if not self.loaded: + self.load() + return list(self.__content__.values()) + + def load(self) -> Self: + """Loads the content of the directory and marks it as loaded""" + self.__content__ |= load_directory_content(self.path) + self.loaded = True + return self + + def reload(self): + """ + Reloads the content of the Directory + """ + self.loaded = False + return self.load() + + def __str__(self) -> str: + return self.path + + def __repr__(self) -> str: + return self.path + + def __hash__(self): + return hash(self.path) + + def __eq__(self, other): + if isinstance(other, Directory): + return self.path == other.path + return False + + +def load_directory_content(path: str) -> FileContent: + 
"""Directory content loading""" + content: FileContent = {} + + try: + ftp = FTPSingleton.get_instance() + ftp.cwd(path) + path = path.removesuffix("/") + + def line_parser(line: str): + if "" in line: + date, time, _, name = line.strip().split(maxsplit=3) + modify = datetime.strptime(f"{date} {time}", "%m-%d-%y %I:%M%p") + info = {"size": 0, "type": "dir", "modify": modify} + xpath = f"{path}/{name}" + content[name] = Directory(xpath) + else: + date, time, size, name = line.strip().split(maxsplit=3) + modify = datetime.strptime(f"{date} {time}", "%m-%d-%y %I:%M%p") + info: FileInfo = { + "size": size, + "type": "file", + "modify": modify, + } + content[name] = File(path, name, info) + + ftp.retrlines("LIST", line_parser) + except Exception as exc: + raise exc + finally: + FTPSingleton.close() + + to_remove = [ + name + for name in content + if name.upper().endswith(".DBF") + and name.upper().replace(".DBF", ".DBC") in content + ] + + for name in to_remove: + del content[name] + + return content + + +class Database: + """ + Base class for PySUS databases. Contains common functions + for accessing DataSUS FTP server. With this class, it is + possible to construct database classes for different DataSUS + files, sharing state and functionalities. + + Parameters + ftp [FTP]: ftplib.FTP object for connecting in DataSUS server. + name [str]: database name + paths [list[Directory]]: server paths where the files are located + files [list[Files]]: list of parsed Files from Database content + metadata [dict]: dict containing database's metadata information + + Methods + load(): Loads the database paths content to its own content + describe(file): describes a file according to each database's + spec. 
Returns a dict with file information + format(file): extracts from file name database related info, such as + year, month, UF and/or other useful info for the DB + get_files(Any): filters files using database related format, depending + on the database's files specs + """ + + ftp: FTP + name: str + paths: Tuple[Directory, ...] + metadata: dict + __content__: Dict[str, Union[Directory, File]] + + def __init__(self) -> None: + self.ftp = FTP("ftp.datasus.gov.br") + self.__content__ = {} + + def __repr__(self) -> str: + return f"{self.name} - {self.metadata['long_name']}" + + @property + def content(self) -> List[Union[Directory, File]]: + """ + Lists Database content. The `paths` will be loaded if this property is + called or if explicitly using `load()`. To add specific Directory + inside content, `load()` the directory and call `content` again. + """ + if not self.__content__: + logger.info("content is not loaded, use `load()` to load default paths") + return [] + return sorted(list(self.__content__.values()), key=str) + + @property + def files(self) -> List[File]: + """ + Lists Files inside content. To load a specific Directory inside + content, just `load()` this directory and list files again. + """ + return [f for f in self.content if isinstance(f, File)] + + def load( + self, + directories: Optional[ + Union[Directory, List[Directory], Tuple[Directory, ...]] + ] = None, + ) -> Database: + """ + Loads specific directories to Database content. Will aggregate the + files found within Directories into Database.content. 
+ """ + if not directories: + directories = list(self.paths) + + directories_list = to_list(directories) + + for directory in directories_list: + if not isinstance(directory, Directory): + raise ValueError("Invalid directory provided.") + + directory.load() + self.__content__.update(directory.__content__) + return self + + def describe(self, file: File) -> dict: + """ + Receives a `File` and returns a dict with its information, + according to the database's specifications. This method is + helpful to return the FTP's file in a humanized format + + Parameters + file [File]: a `File` instance + """ + ... + + def format(self, file: File) -> tuple: + """ + Formats a File based on the database specifications, + extracting its name's parameters given a pattern. + + Parameters + file [File]: a `File` instance + """ + ... + + def get_files(self, *args, **kwargs) -> list[File]: + """ + Filters the list of `File`s according to each database file + pattern, as UFs, Groups, Years, Months, etc. This method will + also be responsible to look for wrong values within the file + pattern and possible extra characters in its basename + """ + ... + + def download(self, files: List[File], local_dir: str = CACHEPATH) -> List[str]: + """ + Downloads a list of Files. 
+ """ + files = to_list(files) + pbar = tqdm(total=len(files), dynamic_ncols=True) + dfiles = [] + for file in files: + if isinstance(file, File): + dfiles.append(file.download(local_dir=local_dir, _pbar=pbar)) + pbar.close() + if len(dfiles) == 1: + return dfiles[0] + return dfiles + + async def async_download(self, files: List[File], local_dir: str = CACHEPATH): + """ + Asynchronously downloads a list of files + """ - parsed = dateparser.parse(str(v)) - if parsed: - return parsed + async def download_file(file): + if isinstance(file, File): + await file.async_download(local_dir=local_dir) - return datetime.now() + tasks = [download_file(file) for file in files] + await asyncio.gather(*tasks) diff --git a/pysus/api/models.py b/pysus/api/models.py new file mode 100644 index 00000000..56632e34 --- /dev/null +++ b/pysus/api/models.py @@ -0,0 +1,29 @@ +import dateparser +from pydantic import BaseModel, ConfigDict, field_validator +from typing import Optional, Union +from datetime import datetime + + +class FileDescription(BaseModel): + model_config = ConfigDict(coerce_numbers_to_str=True) + + name: str + group: str + year: int + size: int + last_update: datetime + uf: Optional[str] = None + month: Optional[str] = None + disease: Optional[str] = None + + @field_validator("last_update", mode="before") + @classmethod + def parse_modify_date(cls, v: Union[str, datetime]) -> datetime: + if isinstance(v, datetime): + return v + + parsed = dateparser.parse(str(v)) + if parsed: + return parsed + + return datetime.now() diff --git a/pysus/management/ingest.py b/pysus/management/ingest.py new file mode 100644 index 00000000..fc72c3d2 --- /dev/null +++ b/pysus/management/ingest.py @@ -0,0 +1,116 @@ +import requests +from typing import Literal, List +from pathlib import Path + +import boto3 +import duckdb +from sqlalchemy.orm import sessionmaker +from sqlalchemy import create_engine +from botocore.config import Config + +from pysus import CACHEPATH +from pysus.api.ducklake.models 
class S3Client:
    """Context-managed client for the public PySUS object-storage bucket.

    On ``__enter__`` the remote DuckLake catalog is pulled into the local
    cache and attached to a fresh DuckDB connection; on a clean
    ``__exit__`` the (possibly modified) catalog is pushed back.
    """

    def __init__(self, access_key: str, secret_key: str):
        self.access_key = access_key
        self.secret_key = secret_key
        self.bucket = "pysus"
        self.endpoint = "nbg1.your-objectstorage.com"
        self.catalog_local = CACHEPATH / "catalog.db"
        self.catalog_remote = "public/catalog.db"

        self.s3 = boto3.client(
            "s3",
            endpoint_url=f"https://{self.endpoint}",
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name="nbg1",
            # NOTE(review): endpoint appears to require SigV4 signing.
            config=Config(signature_version="s3v4"),
        )
        self.db = None  # DuckDB connection, populated by __enter__

    def __enter__(self):
        self.download_catalog()
        self.db = duckdb.connect()
        self._configure_duckdb()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.db:
            self.db.close()
        # Only publish the catalog when the managed block succeeded, so
        # a failed ingestion never overwrites the remote state.
        if exc_type is None:
            self.upload_catalog()

    @property
    def catalog_url(self) -> str:
        """Public HTTPS URL of the remote catalog object."""
        return f"https://{self.endpoint}/{self.bucket}/{self.catalog_remote}"

    def _configure_duckdb(self):
        """Load the ducklake extension and point DuckDB at the bucket.

        NOTE(review): credentials are interpolated into SQL text because
        DuckDB ``SET`` statements cannot take bound parameters; the SQL
        never leaves the local connection.
        """
        self.db.execute("INSTALL ducklake; LOAD ducklake;")
        self.db.execute(f"""
            SET s3_endpoint='{self.endpoint}';
            SET s3_region='nbg1';
            SET s3_url_style='path';
            SET s3_use_ssl=true;
            SET s3_access_key_id='{self.access_key}';
            SET s3_secret_access_key='{self.secret_key}';
        """)
        self.db.execute(f"ATTACH 'ducklake:{self.catalog_local}' AS pysus;")
        self.db.execute("USE pysus;")

    def download_catalog(self):
        """Fetch the remote catalog into the local cache, best effort.

        A network error is deliberately ignored (e.g. 404 before any
        catalog has ever been uploaded): DuckDB will then initialize a
        fresh catalog locally.
        """
        self.catalog_local.parent.mkdir(parents=True, exist_ok=True)
        try:
            # Timeout added: the original unbounded ``requests.get``
            # could hang the ingestion forever on a stalled connection.
            r = requests.get(self.catalog_url, timeout=60)
            r.raise_for_status()
            self.catalog_local.write_bytes(r.content)
        except requests.exceptions.RequestException:
            pass

    def upload_catalog(self):
        """Push the local catalog file back to the public bucket."""
        self.s3.upload_file(
            str(self.catalog_local),
            self.bucket,
            self.catalog_remote,
        )
bind=create_engine(f"duckdb:///{client.catalog_local}") + ) + + def ingest( + self, + origin: Literal["ftp", "dadosgov"], + file: FTPFile | Resource, + force: bool = False, + ) -> None: ... + + def bulk_ingest( + self, + origin: Literal["ftp", "dadosgov"], + files: List[FTPFile | Resource], + ) -> None: ... + + def _ftp_ingest(self, file: FTPFile) -> None: ... + + def _dadosgov_ingest(self, file: Resource) -> None: ... + + def _should_insert(self, file: FTPFile | Resource) -> bool: ... + + def _download_file(self, file: FTPFile | Resource) -> Path: ... + + def _extract_metadata(self, file: FTPFile | Resource) -> File: ... + + def _upload_parquet(self, parquet: Path, metadata: File) -> None: ... diff --git a/pysus/management/utils.py b/pysus/management/utils.py new file mode 100644 index 00000000..cbe14e9b --- /dev/null +++ b/pysus/management/utils.py @@ -0,0 +1,16 @@ +import duckdb +from pathlib import Path + + +def csv_to_parquet(csv_file: Path) -> Path: + parquet = csv_file.with_suffix(".parquet") + con = duckdb.connect() + con.execute(f""" + COPY ( + SELECT * + FROM read_csv_auto('{csv_file}') + ) + TO '{parquet}' + (FORMAT PARQUET) + """) + return parquet