Hub: Inquire information from https://llmtxt.dev/hub

amotl · amotl · commit fce13e0a4749 · 2025-05-16T11:05:55.000+02:00
diff --git a/src/cratedb_about/cli.py b/src/cratedb_about/cli.py
@@ -6,6 +6,7 @@
 from pueblo.util.cli import boot_click
 
 from cratedb_about.bundle.llmstxt import LllmsTxtBuilder
+from cratedb_about.hub.model import LLMsTxtHub
 from cratedb_about.outline import CrateDbKnowledgeOutline
 from cratedb_about.query.core import CrateDbKnowledgeConversation
 from cratedb_about.query.model import Example
@@ -99,6 +100,16 @@ def bundle(ctx: click.Context, url: str, format_: str, outdir: Path) -> None:
     logger.info("Ready.")
 
 
+@cli.command()
+@click.pass_context
+def hub(ctx: click.Context) -> None:
+    """Inquire information from https://llmtxt.dev/hub."""
+    # Build the hub scraper first, then let it download and parse the index.
+    hub_index = LLMsTxtHub()
+    hub_index.fetch()
+    logger.info("Ready.")
+
+
 @cli.command()
 @click.argument("question", type=str, required=False)
 @click.option("--backend", type=click.Choice(["openai", "claude"]), default="openai")
diff --git a/src/cratedb_about/hub/__init__.py b/src/cratedb_about/hub/__init__.py
diff --git a/src/cratedb_about/hub/model.py b/src/cratedb_about/hub/model.py
@@ -0,0 +1,59 @@
+import dataclasses
+import typing as t
+
+from bs4 import BeautifulSoup
+
+from cratedb_about.util import get_cache_client
+
+
+@dataclasses.dataclass
+class LLMsTxtHubItem:
+    name: str  # Entry name; the hub parser reads it from the card's <h3>.
+    website: str  # Text of the first <p> in the card's leading <div>.
+    description: str  # Text of the <p> inside the <div> that follows.
+    tags: t.List[str] = dataclasses.field(default_factory=list)  # Not parsed yet.
+    resources: t.List[str] = dataclasses.field(default_factory=list)  # Link hrefs? TODO confirm.
+    # TODO: Parse and add logo.
+
+
+
+class LLMsTxtHub:
+    """
+    Scrape the website cards listed on the llms.txt hub index page.
+    """
+
+    url: str = "https://llmtxt.dev/hub"
+
+    def __init__(self):
+        # Parsed hub entries; replaced on every `fetch()` call.
+        self.items: t.List[LLMsTxtHubItem] = []
+        self.client = get_cache_client()
+
+    def fetch(self) -> "LLMsTxtHub":
+        """
+        Download the hub index page and parse each `website-card` element.
+
+        The parsed entries replace `self.items` and are echoed to stdout.
+        Returns the instance itself, so calls can be chained.
+        """
+        index_html = self.client.get(self.url)
+        bs = BeautifulSoup(index_html, "html.parser")
+        data = []
+        for card in bs.find_all(attrs={"class": "website-card"}):
+            # The first <div> of a card carries name, website, and links.
+            head = card.find(name="div")
+            name = head.find(name="h3").text
+            website = head.find(name="p").text
+            # The description is read from the next <div> in document order.
+            description = head.find_next("div").find(name="p").text
+            # Anchors lacking a usable `href` would yield `None`; skip them.
+            hrefs = [a.get("href") for a in head.find_all("a") if a.get("href")]
+            item = LLMsTxtHubItem(
+                name=name, website=website, description=description, resources=hrefs
+            )
+            data.append(item)
+
+        self.items = data
+        print("data:", data)
+
+        return self