Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,708 changes: 2,708 additions & 0 deletions compass/data/rmp_jurisdictions.csv

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions compass/plugin/one_shot/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
OrdinanceExtractionPlugin,
KeywordBasedHeuristic,
)
import compass.utilities.finalize as _finalize_default
import compass.utilities.finalize_rmp as _finalize_rmp
from compass.plugin.one_shot.generators import (
generate_query_templates,
generate_website_keywords,
Expand Down Expand Up @@ -223,6 +225,18 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin):
WEBSITE_KEYWORDS = {} # set by user or LLM-generated
"""dict: Keyword weight mapping for link crawl prioritization"""

@classmethod
def save_structured_data(cls, doc_infos, out_dir):
    """Build a database from extracted doc infos and save it

    The finalize module is picked from the ``"finalize"`` config entry:
    a value of ``"rmp"`` selects ``compass.utilities.finalize_rmp``;
    any other value (or a missing entry) selects
    ``compass.utilities.finalize``.

    Parameters
    ----------
    doc_infos
        Extraction results, handed unchanged to the selected module's
        ``doc_infos_to_db`` function.
    out_dir
        Output location, handed unchanged to the selected module's
        ``save_db`` function.

    Returns
    -------
    object
        Second item of the tuple returned by ``doc_infos_to_db``
        (named ``num_docs_found`` here).
    """
    # NOTE(review): `config` is not defined in the visible part of this
    # module - confirm it is in scope wherever this method runs.
    use_rmp = config.get("finalize") == "rmp"
    finalizer = _finalize_rmp if use_rmp else _finalize_default

    database, num_docs_found = finalizer.doc_infos_to_db(doc_infos)
    finalizer.save_db(database, out_dir)
    return num_docs_found

async def get_heuristic(self):
"""Get a `BaseHeuristic` instance with a `check()` method

Expand Down
4 changes: 4 additions & 0 deletions compass/plugin/one_shot/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,8 +404,12 @@ def _to_dataframe(self, data):
full_df = full_df.merge(df, on="feature", how="left")

possible_out_cols = [
"location",
"restriction_type",
"geothermal_applicability",
"value",
"units",
"ammendment",
"summary",
"year",
"section",
Expand Down
24 changes: 23 additions & 1 deletion compass/services/cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,8 @@

def _read_pdf_file(pdf_fp, **kwargs):
    """Read a PDF from disk so pdftotext.PDF never needs pickling

    Parameters
    ----------
    pdf_fp : path-like
        Path of the PDF file to read.
    **kwargs
        Keyword arguments for ``PDFDocument``. The
        ``image_to_string_kwargs`` and ``convert_from_bytes_kwargs``
        entries are discarded before the document is built.

    Returns
    -------
    tuple
        The ``PDFDocument`` built from the parsed pages, followed by
        the raw bytes of the file.
    """
    # These two options are only forwarded by the OCR reader in this
    # module; drop them so they never reach `PDFDocument`.
    for ocr_option in ("image_to_string_kwargs", "convert_from_bytes_kwargs"):
        kwargs.pop(ocr_option, None)

    raw_bytes = Path(pdf_fp).read_bytes()
    document = PDFDocument(read_pdf(raw_bytes, verbose=False), **kwargs)
    return document, raw_bytes
Expand All @@ -283,8 +285,16 @@
if tesseract_cmd:
_configure_pytesseract(tesseract_cmd)

image_to_string_kwargs = kwargs.pop("image_to_string_kwargs", None)
convert_from_bytes_kwargs = kwargs.pop("convert_from_bytes_kwargs", None)

pdf_bytes = Path(pdf_fp).read_bytes()
pages = read_pdf_ocr(pdf_bytes, verbose=False)
pages = read_pdf_ocr(
pdf_bytes,
verbose=True,
image_to_string_kwargs=image_to_string_kwargs,
convert_from_bytes_kwargs=convert_from_bytes_kwargs,
)
doc = PDFDocument(_try_decode_ocr_pages(pages), **kwargs)
doc.attrs["from_ocr"] = True
return doc, pdf_bytes
Expand Down Expand Up @@ -366,9 +376,21 @@
def _configure_pytesseract(tesseract_cmd):
    """Set the tesseract_cmd and install a tolerant temp-file cleanup

    Parameters
    ----------
    tesseract_cmd : str
        Value assigned to ``pytesseract.pytesseract.tesseract_cmd``.
    """
    import pytesseract  # noqa: PLC0415

    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    # On Windows, Tesseract may still hold the temp PPM file open when
    # pytesseract's cleanup runs, causing WinError 32. Patch cleanup to
    # suppress all OSErrors so the OCR result is not lost.
    def _cleanup_win(temp_name):
        """Remove every file whose path starts with `temp_name`"""
        if not temp_name:
            # No temp prefix means there is nothing to clean up
            return

        prefix = Path(temp_name)
        # Only the final path component is used as a glob pattern; the
        # parent directory is taken literally.
        for temp_file in prefix.parent.glob(f"{prefix.name}*"):
            with contextlib.suppress(OSError):
                temp_file.unlink()

    pytesseract.pytesseract.cleanup = _cleanup_win


def _try_decode_ocr_pages(pages):
"""Try to decode pages into strings"""
Expand Down
Loading
Loading