-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocument_loaders.py
More file actions
67 lines (50 loc) · 1.78 KB
/
document_loaders.py
File metadata and controls
67 lines (50 loc) · 1.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
from typing import List
import trafilatura
def load_pdf(file_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file, handed to ``pypdf.PdfReader``.

    Returns:
        The text of each page, in page order, separated by a blank line.
        Pages whose extracted text is empty are left out.
    """
    # Imported lazily so pypdf is only required when a PDF is actually loaded.
    from pypdf import PdfReader

    pdf = PdfReader(file_path)
    extracted = (page.extract_text() for page in pdf.pages)
    # Skip pages that produced no text so the output has no empty sections.
    return "\n\n".join(chunk for chunk in extracted if chunk)
def load_docx(file_path: str) -> str:
    """Extract paragraph and table text from a .docx file.

    All non-blank paragraphs come first, followed by one line per table row
    in which the non-empty cell texts are joined with ``" | "``.

    Args:
        file_path: Path of the .docx file, handed to ``docx.Document``.

    Returns:
        The collected pieces of text separated by a blank line.
    """
    # Imported lazily so python-docx is only required when a .docx is loaded.
    from docx import Document

    document = Document(file_path)

    # Paragraph text is kept as-is; whitespace-only paragraphs are dropped.
    chunks = [
        paragraph.text
        for paragraph in document.paragraphs
        if paragraph.text.strip()
    ]

    # NOTE(review): merged cells may be reported more than once by
    # ``row.cells``, which would repeat their text -- confirm with python-docx.
    for table in document.tables:
        for row in table.rows:
            stripped_cells = [cell.text.strip() for cell in row.cells]
            line = " | ".join(value for value in stripped_cells if value)
            if line:
                chunks.append(line)

    return "\n\n".join(chunks)
def load_csv(file_path: str) -> str:
    """Render a CSV file as text, one ``column: value`` line per row.

    Each row becomes ``"col1: val1 | col2: val2 | ..."``. Missing values
    (as detected by ``pandas.notna``) are omitted, and rows with no values
    at all are skipped.

    Args:
        file_path: Path of the CSV file, handed to ``pandas.read_csv``.

    Returns:
        The rendered rows separated by a blank line; an empty string when
        the file has no data rows.
    """
    # Imported lazily so pandas is only required when a CSV is loaded.
    import pandas as pd

    df = pd.read_csv(file_path)
    columns = list(df.columns)

    text_parts = []
    # itertuples keeps each column's own dtype. iterrows builds one Series
    # per row, which upcasts mixed int/float rows so that an integer such as
    # 1 would be rendered as "1.0".
    for values in df.itertuples(index=False, name=None):
        row_text = " | ".join(
            f"{col}: {val}" for col, val in zip(columns, values) if pd.notna(val)
        )
        if row_text:
            text_parts.append(row_text)
    return "\n\n".join(text_parts)
def load_txt(file_path: str) -> str:
    """Read a plain-text file as UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The whole file content as a string.
    """
    # errors="ignore" drops bytes that are not valid UTF-8 instead of raising,
    # so a partly corrupted file still yields its readable text.
    with open(file_path, encoding="utf-8", errors="ignore") as source:
        contents = source.read()
    return contents
def load_urls(urls: List[str]) -> str:
    """Download web pages and return their extracted main text.

    Each URL is fetched with ``trafilatura.fetch_url`` and its text pulled
    out with ``trafilatura.extract``. URLs for which either step returns
    nothing are skipped. An exception while handling one URL is printed and
    does not stop the remaining URLs from being processed.

    Args:
        urls: The URLs to load, processed in the given order.

    Returns:
        One ``"Source: <url>"`` section per successfully loaded page,
        separated by ``---`` dividers; an empty string when nothing loaded.
    """
    sections = []
    for address in urls:
        try:
            html = trafilatura.fetch_url(address)
            if not html:
                continue
            body = trafilatura.extract(html)
            if not body:
                continue
            sections.append(f"Source: {address}\n\n{body}")
        except Exception as err:
            # Best effort: report the failure and move on to the next URL.
            print(f"Error loading URL {address}: {err}")
    return "\n\n---\n\n".join(sections)