|
| 1 | +import dataclasses |
| 2 | +import typing as t |
| 3 | + |
| 4 | +from bs4 import BeautifulSoup |
| 5 | + |
| 6 | +from cratedb_about.util import get_cache_client |
| 7 | + |
| 8 | + |
@dataclasses.dataclass()
class LLMsTxtHubItem:
    """
    One entry of the llms.txt hub listing.

    `name`, `website` and `description` are required; `tags` and
    `resources` default to a fresh empty list per instance.
    """

    # Required text fields.
    name: str
    website: str
    description: str

    # Optional collections; `default_factory` avoids sharing one list
    # object between instances.
    tags: t.List[str] = dataclasses.field(default_factory=list)
    resources: t.List[str] = dataclasses.field(default_factory=list)

    # TODO: Parse and add logo.
| 17 | + |
| 18 | + |
| 19 | + |
class LLMsTxtHub:
    """
    Reader for the hub index page at `url`.

    `fetch()` downloads the page, extracts every element carrying the
    `website-card` class, and stores the parsed entries on `self.items`.
    """

    url: str = "https://llmtxt.dev/hub"

    def __init__(self):
        # Parsed hub entries; empty until `fetch()` has been called.
        self.items: t.List[LLMsTxtHubItem] = []
        # HTTP client provided by the project's `get_cache_client()` helper.
        self.client = get_cache_client()

    def fetch(self):
        """
        Download the hub index page and parse its cards into `self.items`.

        Each call replaces the previous content of `self.items`. Cards that
        do not have the expected structure are skipped.

        Returns:
            The instance itself, so calls can be chained.
        """
        response = self.client.get(self.url)
        # NOTE(review): the return type of `client.get()` is not visible here.
        # Use `.text` when the client hands back a response object, and fall
        # back to the value itself when it is already markup -- TODO confirm.
        index_html = getattr(response, "text", response)

        soup = BeautifulSoup(index_html, "html.parser")
        items = []
        for card in soup.find_all(attrs={"class": "website-card"}):
            item = self._parse_card(card)
            if item is not None:
                items.append(item)

        self.items = items
        return self

    @staticmethod
    def _parse_card(card) -> "t.Optional[LLMsTxtHubItem]":
        """Convert one `website-card` element into an item, or None if malformed."""
        # The first <div> of the card holds the heading, the website line
        # and the links.
        head = card.find(name="div")
        if head is None:
            return None

        name_tag = head.find(name="h3")
        website_tag = head.find(name="p")

        # The description is read from the first <p> of the next <div> in
        # document order after the head element.
        body = head.find_next("div")
        description_tag = body.find(name="p") if body is not None else None

        if name_tag is None or website_tag is None or description_tag is None:
            return None

        # `href=True` restricts the match to anchors that carry an href,
        # which keeps `None` out of the resources list.
        resources = [link["href"] for link in head.find_all("a", href=True)]

        return LLMsTxtHubItem(
            name=name_tag.text,
            website=website_tag.text,
            description=description_tag.text,
            resources=resources,
        )
0 commit comments