Adding WebsiteMetadataProcessor to preprocessing_utils #49
```diff
@@ -14,8 +14,11 @@
 This script provides functions for adding different kinds of metadata to a pretraining corpus.
 """
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from typing import Dict, List, Optional
 
+import requests
+
 
 class MetadataPreprocessor(ABC):
     """A metadata processor can be used for preprocessing text and adding or extracting metadata information."""
```
```diff
@@ -52,3 +55,63 @@ def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
     def _extract_timestamp_from_url(self, url: str) -> Optional:
         # This would have to be implemented.
         return None
+
+
+class WebsiteDescPreprocessor(MetadataPreprocessor):
+    """Metadata preprocessor for adding website description based on URLs."""
+
+    website_description_cache = {}
+    org_list = ["com", "co", "org", "go", "in"]
+
+    def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
+        metadata_list = examples["metadata"]
+
+        # Iterate through the metadata associated with all examples in this batch.
+        for metadata in metadata_list:
+            # Get the URL associated with this example.
+            urls = [md["value"] for md in metadata if md["key"] == "url"]
+
+            if not urls:
+                continue
+
+            # Try to extract a website description from the given URL and add it to the metadata.
+            website_description = self._extract_website_desc_from_url(urls[0])
+
+            if website_description:
+                metadata.append({"key": "timestamp", "type": "global", "value": website_description})
```
**Contributor:** This should probably be something like `"website_description"` instead of `"timestamp"`.
```diff
+
+        return examples
+
+    def _extract_website_desc_from_url(self, url: str) -> Optional:
+        domain = url.split("/")[2]  # e.g. http://www.californialandcan.org/Plumas -> www.californialandcan.org
```
**Contributor:** This would fail for URLs that don't start with `http://` or `https://`.

**Collaborator:** Pardon me for the intrusion.
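A scheme-tolerant way to get the domain is `urllib.parse.urlparse` from the standard library. The sketch below is illustrative and not code from this PR; `_extract_domain` is a hypothetical helper:

```python
from urllib.parse import urlparse

def _extract_domain(url: str) -> str:
    # urlparse only fills in netloc when a scheme is present, so give
    # scheme-less URLs such as "www.example.org/page" a default one.
    if "//" not in url:
        url = "http://" + url
    return urlparse(url).netloc  # e.g. "www.californialandcan.org"
```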
```diff
+        keywords = domain.split(".")
+
+        keyword = (
+            keywords[-2]
+            if len(keywords[-2]) > 3
+            else keywords[1]
+            if keywords[1] not in self.org_list
+            else keywords[0]
+        )  # extracting the keyword from the domain, e.g. www.californialandcan.org -> californialandcan
+
+        if keyword not in self.website_description_cache:
+            self.website_description_cache[keyword] = self.extract_wiki_desc(keyword)
+
+        return self.website_description_cache[keyword]
+
+    def extract_wiki_desc(self, keyword: str) -> Optional:
```
**Contributor:** As JeanZ has no access to the internet, I think we need to first download all wiki descriptions as a preprocessing step.
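One way to do that is to resolve every keyword once on a machine with internet access and serialize the cache for offline use. The sketch below is hypothetical and not part of this PR; `prefetch_wiki_descriptions` and its arguments are illustrative:

```python
import json

def prefetch_wiki_descriptions(keywords, processor, path="wiki_desc_cache.json"):
    # Run once on an internet-connected machine: look up each keyword
    # and store the result so the offline training job can read it back.
    cache = {kw: processor.extract_wiki_desc(kw) for kw in keywords}
    with open(path, "w") as f:
        json.dump(cache, f)

# Offline, the serialized file can then seed the class-level cache:
# with open("wiki_desc_cache.json") as f:
#     WebsiteDescPreprocessor.website_description_cache = json.load(f)
```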
```diff
+        keyword = keyword.replace(" ", "_")
+        r = requests.get(
+            "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles="
+            + keyword
+            + "&exintro=&exsentences=2&explaintext=&redirects=&formatversion=2&format=json"
+        )
+        page = r.json()
+
+        try:
+            return page["query"]["pages"][0]["extract"]
+        except (KeyError, IndexError):
+            return None
```
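For context on the `try`/`except`: with `formatversion=2`, the MediaWiki extracts endpoint returns pages as a list, which is why the code indexes `pages[0]`. The shape is roughly the following (values are illustrative, not taken from this PR):

```python
# Rough shape of the JSON returned by the query above (illustrative values):
page = {
    "batchcomplete": True,
    "query": {
        "pages": [
            {"pageid": 12345, "title": "Example", "extract": "First two sentences of the article ..."}
        ]
    },
}
# For a missing page, the entry carries a "missing" flag and no "extract"
# key, which is what the KeyError guard above catches.
```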
**Reviewer:** What's the reason for using this specific set of top-level domains?
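On that question: a library-based alternative that avoids a hand-maintained TLD list is the third-party `tldextract` package, which consults the Public Suffix List. This is a suggestion, not part of the PR:

```python
import tldextract

# The Public Suffix List covers multi-part suffixes such as ".co.uk"
# or ".go.id" that a short hand-written org_list would miss.
ext = tldextract.extract("http://www.californialandcan.org/Plumas")
print(ext.domain)  # -> "californialandcan"
```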