push
102
scraper.py
Normal file
@@ -0,0 +1,102 @@
import os
import time
from urllib.parse import urlparse

import requests


# --- Configuration ---
API_ENDPOINT = "https://www.googleapis.com/customsearch/v1"
DEFAULT_LOCALE = "cs"  # language for results
DEFAULT_NUM = 10  # how many results to fetch (max 10 per API call)
RATE_SECONDS = 5.0  # minimum wait between API requests
last_api_call = 0.0  # timestamp of the most recent API call


def _throttle_api():
    """
    Pause if the previous API call was too recent.

    A very basic rate limiter: at most one request every RATE_SECONDS
    seconds, which helps avoid burning through the daily quota too fast.
    """
    global last_api_call
    now = time.time()
    wait = RATE_SECONDS - (now - last_api_call)

    if wait > 0:
        time.sleep(wait)
    last_api_call = time.time()
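
# Illustrative timing (editor's sketch, not part of the original commit):
# with RATE_SECONDS = 5.0, two back-to-back calls behave like this:
#
#   _throttle_api()  # first call: last_api_call is 0.0, returns immediately
#   _throttle_api()  # second call: sleeps ~5 seconds, then returns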


def favicon_from_link(link):
    """Derive a best-effort favicon URL from a result link."""
    try:
        u = urlparse(link)
        scheme = u.scheme or "https"
        netloc = u.netloc
        if not netloc and u.path:
            # Handle URLs without a scheme, e.g. example.com/path
            netloc = u.path.split("/")[0]
        if not netloc:
            return ""
        return f"{scheme}://{netloc}/favicon.ico"
    except Exception:
        return ""


def _cse_request(q, num=DEFAULT_NUM, hl=DEFAULT_LOCALE):
    """Call the Custom Search API and return the parsed JSON data.

    Requires two environment variables (e.g. from your .env file):
    - GOOGLE_DEVELOPER_KEY (your API key)
    - GOOGLE_CSE_ID (your search engine ID)
    """
    api_key = os.environ.get("GOOGLE_DEVELOPER_KEY")
    cse_id = os.environ.get("GOOGLE_CSE_ID")
    if not api_key:
        raise RuntimeError("GOOGLE_DEVELOPER_KEY is not set in .env")
    if not cse_id:
        raise RuntimeError("GOOGLE_CSE_ID (Programmable Search Engine ID) is not set in .env")

    _throttle_api()
    params = {
        "key": api_key,
        "cx": cse_id,
        "q": q,
        "num": min(max(num, 1), 10),  # the API allows at most 10 results per call
        "hl": hl,
        "safe": "off",
    }
    resp = requests.get(API_ENDPOINT, params=params, timeout=15)
    if resp.status_code != 200:
        # Try to extract a readable error message from the API response
        try:
            data = resp.json()
            msg = data.get("error", {}).get("message") or resp.text
        except Exception:
            msg = resp.text
        raise RuntimeError(f"Google CSE API error ({resp.status_code}): {msg}")
    return resp.json()
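
# Note (editor's sketch): this module only reads os.environ, so if the
# credentials live in a .env file the caller must load it first, for
# example with python-dotenv (an assumption; not imported by this module):
#
#   from dotenv import load_dotenv
#   load_dotenv()  # populates os.environ before _cse_request() is used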


def get_google_first_page(query):
    """Return a list of results for a query using the Custom Search API.

    Each item has: position, title, link, snippet, icon.
    """
    data = _cse_request(query, num=DEFAULT_NUM, hl=DEFAULT_LOCALE)
    items = data.get("items", []) or []

    results = []
    for i, item in enumerate(items, start=1):
        link = item.get("link") or item.get("formattedUrl") or ""
        title = item.get("title") or item.get("htmlTitle") or ""
        snippet = item.get("snippet") or item.get("htmlSnippet") or ""
        results.append({
            "position": i,
            "title": title,
            "link": link,
            "snippet": snippet,
            "icon": favicon_from_link(link),
        })
    return results
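
# A minimal manual check (editor's sketch; assumes GOOGLE_DEVELOPER_KEY and
# GOOGLE_CSE_ID are already set, e.g. exported in the shell or loaded from
# .env as shown above; the query string is illustrative):
if __name__ == "__main__":
    for r in get_google_first_page("python requests tutorial"):
        print(f'{r["position"]:2d}. {r["title"]}')
        print(f'    {r["link"]}')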