
Commit a5a0c6a

Merge pull request #49 from shanyas10/ds_preprocessing_website
Adding WebsiteMetadataProcessor to preprocessing_utils
2 parents aaacc87 + 1611733 commit a5a0c6a

7 files changed: 182 additions & 0 deletions


Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
out_dir=${1:-bsmetadata/preprocessing_data}  # default directory: preprocessing_data

## Clone the Hugging Face dataset repo containing the wiki dump
mkdir -p "$out_dir"
HUB_REPO_NAME=bs-modeling-metadata/wiki_dump
git clone https://huggingface.co/datasets/${HUB_REPO_NAME} "$out_dir/wiki_dump"

## Download nltk punkt, used by the sentence tokenizer
python -m nltk.downloader 'punkt'
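The output directory can be overridden through the script's first positional argument; a minimal invocation sketch (the script filename below is hypothetical, since this view does not show it):

bash download_preprocessing_data.sh my_preprocessing_data  # hypothetical filename; without the argument, defaults to bsmetadata/preprocessing_data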
bsmetadata/preprocessing_tools/website_desc_utils.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
import re
from collections import defaultdict
from typing import Optional

import nltk
from wikipedia2vec.dump_db import DumpDB


class WebsiteDescUtils:
    def __init__(self, path_wiki_db) -> None:
        self.cache = defaultdict(str)
        self.wiki_dump_db = DumpDB(path_wiki_db)
        self.redirects_map = {
            key.lower(): value for key, value in self.wiki_dump_db.redirects()
        }  # loading all redirect information: takes ~10s

    def fetch_wikipedia_title_from_keyword(self, keyword: str) -> str:
        # Fall back to a default for domains not in the redirects map: try the DB with the
        # keyword's first label directly (e.g. rightmove.com -> Rightmove), capitalized
        # because Wikipedia titles are capitalized.
        title = self.redirects_map.get(keyword, keyword.split(".")[0].capitalize())
        return title

    def fetch_wikipedia_description_for_title(self, title: str) -> Optional[str]:
        try:
            text = self.wiki_dump_db.get_paragraphs(title)[0].text
            text = re.sub(r"\((?:[^)(]|\([^)(]*\))*\)", "", text)  # strip (possibly nested) parentheticals
            text = nltk.sent_tokenize(text)[0]  # pick the first sentence
        except Exception:
            return None
        return text

    def extract_wiki_desc(self, keyword: str) -> Optional[str]:
        title = self.fetch_wikipedia_title_from_keyword(keyword)
        desc = self.fetch_wikipedia_description_for_title(title)
        return desc

    def fetch_website_description_from_keyword(self, keyword: str) -> Optional[str]:
        if not self.cache[keyword]:
            self.cache[keyword] = self.extract_wiki_desc(keyword)
        return self.cache[keyword]
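For orientation, a minimal sketch of using this utility on its own, assuming the dump DB path produced by the download script above (both the path and the domain are illustrative):

from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils

utils = WebsiteDescUtils("bsmetadata/preprocessing_data/wiki_dump/wiki_en_dump_db")  # illustrative path
# Returns the first sentence of the matching Wikipedia article, or None if no page is found.
print(utils.fetch_website_description_from_keyword("rightmove.com"))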

bsmetadata/preprocessing_utils.py

Lines changed: 38 additions & 0 deletions
@@ -23,6 +23,7 @@
 from REL.ner import load_flair_ner
 from REL.utils import process_results

+from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils
 from bsmetadata.vendor.dateutil.src.dateutil.parser import ParserError, parse


@@ -42,6 +43,11 @@ def parse_date(path):
     return None


+def fetch_keyword_from_url(url: str) -> str:  # e.g. http://www.californialandcan.org/Plumas -> californialandcan.org
+    domain = urlsplit(url).netloc
+    return domain.replace("www.", "")
+
+
 def remove_improbable_date(x):
     if x is not None and (x.year < 1983 or x.year > 2021):
         return None

@@ -88,6 +94,38 @@ def _extract_timestamp_from_url(self, url: str) -> Optional[str]:
         return date


+class WebsiteDescPreprocessor(MetadataPreprocessor):
+    """Metadata preprocessor for adding a website description based on URLs."""
+
+    def __init__(self, path_wiki_db: str = "../preprocessing_data/wiki_dump/wiki_en_dump_db") -> None:
+        self.website_utils = WebsiteDescUtils(path_wiki_db)
+        super().__init__()
+
+    def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
+        metadata_list = examples["metadata"]
+
+        # Iterate through the metadata associated with all examples in this batch.
+        for metadata in metadata_list:
+            # Get the URLs associated with this example.
+            urls = [md["value"] for md in metadata if md["key"] == "url"]
+
+            if not urls:
+                continue
+
+            # Try to extract a website description from the first URL and add it to the metadata.
+            website_description = self._extract_website_desc_from_url(urls[0])
+
+            if website_description:
+                metadata.append({"key": "website_description", "type": "global", "value": website_description})
+        return examples
+
+    def _extract_website_desc_from_url(self, url: str) -> Optional[str]:
+        keyword = fetch_keyword_from_url(url)
+        return self.website_utils.fetch_website_description_from_keyword(keyword)
+
+
 class EntityPreprocessor(MetadataPreprocessor):
     """Metadata preprocessor for adding entity information."""
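For reference, a minimal sketch of applying the new preprocessor with datasets.Dataset.map, mirroring the pattern used in the test file below (the DB path and example data are illustrative):

from datasets import Dataset

from bsmetadata.preprocessing_utils import WebsiteDescPreprocessor

processor = WebsiteDescPreprocessor(path_wiki_db="preprocessing_data/wiki_dump/wiki_en_dump_db")  # illustrative path
ds = Dataset.from_dict(
    {
        "id": [0],
        "text": ["test text"],
        "metadata": [[{"key": "url", "type": "global", "value": "https://www.xyz.com"}]],
    }
)
# Examples whose URL resolves to a Wikipedia page gain a {"key": "website_description", ...} entry.
ds = ds.map(lambda ex: processor.preprocess(ex), batched=True)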

requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -4,3 +4,5 @@ wandb>=0.10.32,<1 # pip will likely update it to 0.12.1, but it is probably ok
 transformers>=4.6.0,<5 # pip will likely update it to 4.10.0, but it is probably ok and good for bugfixes.
 accelerate>=0.4.0,<1 # We may want to use 0.5.0 in the near future
 datasets[streaming]>=1.11.0,<2
+wikipedia2vec==1.0.5
+nltk==3.6.5

setup.py

Lines changed: 1 addition & 0 deletions
@@ -21,5 +21,6 @@ def req_file(filename):
     install_requires=install_requires,
     extras_require={
         "entity_preprocessing": ["REL @ git+https://github.com/manandey/REL.git#egg=REL"],
+        "website_description_preprocessing": ["wikipedia2vec==1.0.5", "nltk==3.6.5"],
     },
 )
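With this extra in place, the new dependencies can be pulled in on their own; a minimal sketch, assuming an editable install from the repository root:

pip install -e ".[website_description_preprocessing]"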

tests/mocks/mock_dump_db.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
from typing import List


class MockParagraph:
    def __init__(self, text):
        self.text = text


class MockDumpDB:
    def __init__(self, db_file) -> None:
        self.db_file = db_file
        self.redirect_info = [("xyz.com", "XYZ"), ("test.com", "Test"), ("test_key", "Test Key")]
        self.paragraphs_map = {
            "XYZ": [
                MockParagraph("XYZ is a U.S. based company."),
                MockParagraph("Test paragraph for the key XYZ."),
            ],
            "Test": [
                MockParagraph("Test is a U.S. based company."),
                MockParagraph("Test paragraph for the key Test."),
            ],
            "Sometitle": [
                MockParagraph("SomeTitle is a U.S. based company."),
                MockParagraph("Test paragraph for the key SomeTitle."),
            ],
        }

    def redirects(self) -> List[tuple]:
        return self.redirect_info

    def get_paragraphs(self, title: str):
        return self.paragraphs_map[title]

tests/test_preprocessing_utils.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
import unittest
from unittest import mock

from datasets import Dataset
from mocks.mock_dump_db import MockDumpDB

from bsmetadata.preprocessing_utils import WebsiteDescPreprocessor


def mock_sent_tokenize(text):
    return [text]


class WebsiteDescPreprocessorTester(unittest.TestCase):
    @mock.patch("bsmetadata.preprocessing_tools.website_desc_utils.DumpDB")
    def setUp(self, mock_db) -> None:
        mock_db.return_value = MockDumpDB("some/path")
        self.website_processor = WebsiteDescPreprocessor()
        self.example_ids = [0, 1, 2]
        self.example_text = ["test text 1", "test text 2", "test text 3"]
        self.example_metadata = [
            [{"key": "url", "type": "global", "value": "https://www.xyz.com"}],
            [
                {"key": "url", "type": "global", "value": "http://sometitle.com"},
                {"key": "url", "type": "global", "value": "http://notfound.com"},
            ],
            [{"key": "url", "type": "global", "value": "https://www.test.com"}],
        ]
        self.example_dict = {"id": self.example_ids, "metadata": self.example_metadata, "text": self.example_text}

    @mock.patch("bsmetadata.preprocessing_tools.website_desc_utils.nltk.sent_tokenize", new=mock_sent_tokenize)
    def test_website_metadata_processor(self):
        ds = Dataset.from_dict(self.example_dict)
        ds = ds.map(lambda ex: self.website_processor.preprocess(ex), batched=True)
        target_metadata = [
            [
                {"key": "url", "type": "global", "value": "https://www.xyz.com"},
                {"key": "website_description", "type": "global", "value": "XYZ is a U.S. based company."},
            ],
            [
                {"key": "url", "type": "global", "value": "http://sometitle.com"},
                {"key": "url", "type": "global", "value": "http://notfound.com"},
                {"key": "website_description", "type": "global", "value": "SomeTitle is a U.S. based company."},
            ],
            [
                {"key": "url", "type": "global", "value": "https://www.test.com"},
                {"key": "website_description", "type": "global", "value": "Test is a U.S. based company."},
            ],
        ]
        self.assertEqual(ds[:]["metadata"], target_metadata)


if __name__ == "__main__":
    unittest.main()
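Because the test imports mocks.mock_dump_db without a package prefix, it appears to expect the tests directory as the working directory; a minimal sketch for running it:

cd tests
python test_preprocessing_utils.py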
