-
Notifications
You must be signed in to change notification settings - Fork 46
Adding the libre textbooks #149
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 20 commits
70a9dd1
c4b61fd
6876820
26b4488
5de34a7
c87307a
0bd8b97
579cc79
41fd816
41b5dfc
9c68bf9
bfcdb4c
8050d7e
f17e326
85f161e
706442d
af2982c
78ff8f7
acd40c3
12f854f
2e0e0fd
2ba93b0
b34267c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,30 @@ | ||||||||
| --- | ||||||||
| name: libre_textbooks | ||||||||
| description: A dataset of scraped articles from libre textbooks | ||||||||
| targets: | ||||||||
| - id: html | ||||||||
| description: A scraped page from libre textbooks | ||||||||
| units: | ||||||||
| type: text | ||||||||
| names: | ||||||||
| - natural language article | ||||||||
| pubchem_aids: [] | ||||||||
| uris: [] | ||||||||
| identifiers: | ||||||||
| - id: 'url ' | ||||||||
| type: string | ||||||||
| description: url of the page the content is scraped from | ||||||||
| - id: text_length | ||||||||
| type: int | ||||||||
| description: text character count | ||||||||
|
Comment on lines
+17
to
+19
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
| license: CC BY 4.0 | ||||||||
| links: | ||||||||
| - name: Libre Textbooks | ||||||||
| url: https://chem.libretexts.org/Bookshelves | ||||||||
| description: '' | ||||||||
| - name: Hugging Face dataset upload | ||||||||
| url: https://huggingface.co/datasets/Hack90/libre_chem_textbooks | ||||||||
| description: Hugging Face dataset uploaded to HF account | ||||||||
| benchmarks: [] | ||||||||
| num_points: 3740 | ||||||||
| bibtex: [] | ||||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,81 @@ | ||||||
| from datasets import load_dataset | ||||||
| import pandas as pd | ||||||
| import yaml | ||||||
|
|
||||||
|
|
||||||
# Presumably the path of a JSONL file listing lines to strip from the scraped
# text (inferred from the name only). Nothing in the visible part of this
# script references it, and the absolute path is tied to one workspace layout.
# NOTE(review): confirm whether this constant is still needed.
LINES_TO_REMOVE = "/workspaces/chemnlp/data/libre_textbooks/lines_to_remove.jsonl"
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is not used below. Are those lines already removed on the HF dataset upload? |
||||||
# Hugging Face dataset identifier; used as the default argument of
# get_raw_data(), which passes it to `load_dataset`.
RAW_DATASET = "Hack90/libre_chem_textbooks"
|
|
||||||
|
|
||||||
# Destination of the generated metadata file (relative to the working directory).
META_YAML_PATH = "./data/libre_textbooks/meta.yaml"

# Metadata describing the dataset. create_meta_yaml() serializes this mapping
# to META_YAML_PATH, so key order here is the key order of the YAML output.
META_TEMPLATE = {
    "name": "libre_textbooks",  # unique identifier, we will also use this for directory names
    "description": "A dataset of scraped articles from libre textbooks",
    "targets": [
        {
            "id": "html",  # name of the column in a tabular dataset
            "description": "A scraped page from libre textbooks",
            "units": None,  # units of the values in this column (leave empty if unitless)
            "type": "string",  # can be "categorical", "ordinal", "continuous", "string"
            "names": [  # names for the property (to sample from for building the prompts)
                "natural language article",
            ],
            "pubchem_aids": [],
            "uris": [],
        },
    ],
    "identifiers": [
        {
            # NOTE(review): the id carries a trailing space. It is kept as-is
            # because it may mirror the real column name in the uploaded
            # dataset -- confirm, and drop the space if it is a typo.
            "id": "url ",  # column name
            # Spelled "OTHER" to match the allowed values and the entry below.
            "type": "OTHER",  # can be "SMILES", "SELFIES", "IUPAC", "OTHER"
            "description": "url of the page the content is scraped from",
        },
        {
            "id": "text_length",  # text character count
            "type": "OTHER",  # can be "SMILES", "SELFIES", "IUPAC", "OTHER"
            "description": "text character count",
        },
    ],
    "license": "CC BY 4.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        {
            "name": "Libre Textbooks",
            "url": "https://chem.libretexts.org/Bookshelves",
            "description": "",
        },
        {
            "name": "Hugging Face dataset upload",
            "url": "https://huggingface.co/datasets/Hack90/libre_chem_textbooks",
            "description": "Hugging Face dataset uploaded to HF account",  # Hopefully will move this
            # to the openbioml space
        },
    ],
    "benchmarks": [],
    "num_points": 3740,  # number of datapoints in this dataset
    "bibtex": [
        # noqa
    ],
}
|
|
||||||
|
|
||||||
def get_raw_data(raw_dataset: str = RAW_DATASET) -> pd.DataFrame:
    """Return the ``train`` split of ``raw_dataset`` as a pandas DataFrame.

    Args:
        raw_dataset: Dataset identifier handed to ``load_dataset``.

    Returns:
        A DataFrame built from the ``train`` split's ``to_pandas()`` output.
    """
    train_split = load_dataset(raw_dataset)["train"]
    return pd.DataFrame(train_split.to_pandas())
|
|
||||||
|
|
||||||
def create_meta_yaml(num_points: int) -> None:
    """Write the dataset metadata to ``META_YAML_PATH`` as YAML.

    The metadata is ``META_TEMPLATE`` with its ``num_points`` entry replaced
    by the given value. The module-level template itself is left untouched.

    Args:
        num_points: Number of datapoints to record under ``num_points``.
    """
    # Work on a per-call copy so repeated calls never alter the shared
    # template. Overriding an existing key keeps its original position, so
    # the key order of the YAML output is unchanged.
    meta = {**META_TEMPLATE, "num_points": num_points}
    # Plain "w" is enough (nothing is read back); the explicit encoding keeps
    # the output independent of the platform default.
    with open(META_YAML_PATH, "w", encoding="utf-8") as f:
        yaml.dump(meta, f, sort_keys=False)
    # The previous message repeated the dataset name twice.
    print(f"Finished processing {meta['name']} dataset!")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Load the raw data once and record its row count in the metadata file.
    create_meta_yaml(len(get_raw_data()))
Uh oh!
There was an error while loading. Please reload this page.