Merged (33 commits)

5d6a2a2  Update preprocessing_utils.py (Oct 15, 2021)
80dcd7a  Update preprocessing_utils.py (Oct 15, 2021)
742d2db  Merge branch 'master' into ds_preprocessing_website (Oct 29, 2021)
6cd8389  Merge branch 'master' into ds_preprocessing_website (Oct 31, 2021)
7aa1797  adding processor for website metadata (Nov 1, 2021)
561867a  Create download_wiki_dump.sh (Nov 1, 2021)
7a59eba  run style and quality checks (Nov 1, 2021)
fce9980  Update website_desc_utils.py (Nov 1, 2021)
ab7bd72  adding tokenization for sentence (Nov 1, 2021)
60e6620  Update download_wiki_dump.sh (Nov 1, 2021)
0299c88  add test (Nov 1, 2021)
5a3f0f1  Update preprocessing_utils.py (Nov 1, 2021)
2bf8ca2  Update preprocessing_utils.py (Nov 1, 2021)
1ef9493  Update test_preprocessing_utils.py (Nov 1, 2021)
19ff034  Update test_preprocessing_utils.py (Nov 1, 2021)
a6761db  adding tests (Nov 1, 2021)
9afa782  fixing a bug in mocking (Nov 1, 2021)
46773a9  Update test.yml (Nov 1, 2021)
8cd384d  updating name in workflow (Nov 1, 2021)
e6e0342  adding nltk to requirements (Nov 1, 2021)
a3785e3  Merge branch 'ds_preprocessing_website' into test_ds_preprocess (Nov 1, 2021)
a82cd78  Update test_preprocessing_utils.py (Nov 1, 2021)
8160bda  Update test_preprocessing_utils.py (Nov 1, 2021)
f8c05e2  fixing tests (Nov 1, 2021)
7ea07b4  reverting changes from test (Nov 1, 2021)
7ef1d9a  fixing quality (Nov 1, 2021)
b251109  modifying script and deleting extra file (Nov 2, 2021)
ca9c9d0  Update preprocessing_utils.py (Nov 2, 2021)
fe9a228  Update download_wiki_dump.sh (Nov 2, 2021)
e185fbe  Update preprocessing_utils.py (Nov 2, 2021)
21c15c1  Merge branch 'master' into ds_preprocessing_website (Nov 23, 2021)
81af40a  addressing PR comments (Nov 23, 2021)
1611733  make quality (Nov 23, 2021)
bsmetadata/preprocessing_utils.py (63 additions, 0 deletions)
@@ -14,8 +14,11 @@
This script provides functions for adding different kinds of metadata to a pretraining corpus.
"""
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import Dict, List, Optional

import requests


class MetadataPreprocessor(ABC):
"""A metadata processor can be used for preprocessing text and adding or extracting metadata information."""
@@ -52,3 +55,63 @@ def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
def _extract_timestamp_from_url(self, url: str) -> Optional:
# This would have to be implemented.
return None


class WebsiteDescPreprocessor(MetadataPreprocessor):
"""Metadata preprocessor for adding website description based on URLs."""

website_description_cache = {}
org_list = ["com", "co", "org", "go", "in"]
Contributor: What's the reason for using this specific set of top-level domains?

def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:

metadata_list = examples["metadata"]

# Iterate through the metadata associated with all examples in this batch.
for metadata in metadata_list:
# Get the URL associated with this example.
urls = [md["value"] for md in metadata if md["key"] == "url"]

if not urls:
continue

# Try to extract a website description from the given URL and add it to the metadata.
website_description = self._extract_website_desc_from_url(urls[0])

if website_description:
metadata.append({"key": "timestamp", "type": "global", "value": website_description})
Contributor: This should probably be something like "key": "website_description"


return examples

def _extract_website_desc_from_url(self, url: str) -> Optional[str]:

domain = url.split("/")[2]  # e.g. http://www.californialandcan.org/Plumas -> www.californialandcan.org
Contributor: This would fail for URLs that don't start with http://. I guess all URLs in C4 start with http://, but it would probably be good to be on the safe side here. Also, you may want to consider using some library like urllib (see https://docs.python.org/3/library/urllib.parse.html) for splitting URLs into components, as this will take care of all unexpected edge cases for you.

Collaborator @tianjianjiang (Oct 15, 2021): Pardon me for the intrusion. I have done some simple steps in #24, just like what @timoschick suggested. Later I will find a way to put some suggestion code snippets here.

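A minimal sketch of the urllib-based splitting suggested above (illustrative only, not part of this PR; the helper name get_domain is made up):

from urllib.parse import urlparse

def get_domain(url: str) -> str:
    # urlparse only fills netloc when a scheme is present, so add one if it is missing
    if "://" not in url:
        url = "http://" + url
    # e.g. "http://www.californialandcan.org/Plumas" -> "www.californialandcan.org"
    return urlparse(url).netloc
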
keywords = domain.split(".")

keyword = (
keywords[-2]
if len(keywords[-2]) > 3
else keywords[1]
if (keywords[1] not in self.org_list)
else keywords[0]
) # extracting the keyword from domain e.g. www.californialandcan.org -> californialandcan

if keyword not in self.website_description_cache:
self.website_description_cache[keyword] = self.extract_wiki_desc(keyword)

return self.website_description_cache[keyword]

def extract_wiki_desc(self, keyword: str) -> Optional[str]:
Contributor: As JeanZ has no access to the internet, I think we need to first download all wiki descriptions as a preprocessing step.

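A rough sketch of such a pre-download step (assumptions: the set of keywords can be collected from the corpus URLs beforehand, and the function names and cache file name are illustrative):

import json

from bsmetadata.preprocessing_utils import WebsiteDescPreprocessor

def build_offline_cache(keywords, path="wiki_desc_cache.json"):
    # Run once on a machine with internet access; ship the resulting JSON file to the offline cluster.
    preprocessor = WebsiteDescPreprocessor()
    cache = {kw: preprocessor.extract_wiki_desc(kw) for kw in set(keywords)}
    with open(path, "w") as f:
        json.dump(cache, f)

def load_offline_cache(preprocessor, path="wiki_desc_cache.json"):
    # Fill the in-memory cache so that no network requests happen during preprocessing.
    with open(path) as f:
        preprocessor.website_description_cache.update(json.load(f))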

keyword = keyword.replace(" ", "_")
r = requests.get(
"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles="
+ keyword
+ "&exintro=&exsentences=2&explaintext=&redirects=&formatversion=2&format=json"
)
page = r.json()

try:
return page["query"]["pages"][0]["extract"]
except (KeyError, IndexError):
return None
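
For reference, a small usage sketch of the new preprocessor on a toy batch (the batch layout is inferred from the preprocess method above, and the "text" key is only illustrative; a live Wikipedia lookup is needed unless the cache is pre-filled):

from bsmetadata.preprocessing_utils import WebsiteDescPreprocessor

processor = WebsiteDescPreprocessor()
batch = {
    "text": ["Some document text."],
    "metadata": [[{"key": "url", "type": "global", "value": "http://www.californialandcan.org/Plumas"}]],
}
batch = processor.preprocess(batch)
# The first example's metadata now holds an extra entry with the website description
# (see the review comment above about the key name used for that entry).
print(batch["metadata"][0])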