Files
inizio-web-search/scraper.py
2025-10-20 13:49:38 +02:00

100 lines
3.0 KiB
Python

import os
import time
from urllib.parse import urlparse
import requests
# --- Configuration ---

# Base URL that every Custom Search request is sent to.
API_ENDPOINT: str = "https://www.googleapis.com/customsearch/v1"

# Locale code passed to the API as its "hl" parameter.
DEFAULT_LOCALE: str = "cs"

# Number of results requested per call (a single API call returns at most 10).
DEFAULT_NUM: int = 10

# Minimum number of seconds to leave between two consecutive API requests.
RATE_SECONDS: float = 5.0

# Timestamp of the most recent API request; 0.0 until the first request is made.
last_api_call: float = 0.0
def _throttle_api():
    """Block until at least RATE_SECONDS have passed since the previous call.

    Reads and then updates the module-level ``last_api_call`` timestamp.

    The interval is measured with ``time.monotonic()`` rather than the wall
    clock, so system clock adjustments (NTP sync, manual changes) can neither
    skip the pause nor stretch it.
    """
    global last_api_call
    elapsed = time.monotonic() - last_api_call
    remaining = RATE_SECONDS - elapsed
    if remaining > 0:
        # Never sleep longer than one full interval, even if the stored
        # timestamp is ahead of the monotonic clock for some reason.
        time.sleep(min(remaining, RATE_SECONDS))
    last_api_call = time.monotonic()
def favicon_from_link(link):
    """Build the conventional ``/favicon.ico`` URL for the site hosting *link*.

    Args:
        link: An absolute URL ("https://example.com/page") or a scheme-less
            address ("example.com/page", "example.com:8080/page").

    Returns:
        ``"<scheme>://<netloc>/favicon.ico"``, using "https" when *link*
        carries no scheme of its own. An empty string is returned whenever no
        host can be determined: non-string or blank input, relative paths,
        malformed URLs, and host-less URIs such as "mailto:" or "javascript:".
    """
    if not isinstance(link, str):
        return ""
    candidate = link.strip()
    if not candidate:
        return ""
    try:
        parts = urlparse(candidate)
        scheme = parts.scheme or "https"
        netloc = parts.netloc
        if not netloc:
            # No "//authority" part. urlparse reads "example.com:8080/x" as
            # scheme "example.com" + path "8080/x", so a digits-only first
            # path segment means the "scheme" is really a host with a port.
            first_segment = parts.path.split("/", 1)[0]
            looks_like_port = (
                first_segment != "" and first_segment.strip("0123456789") == ""
            )
            if parts.scheme and not looks_like_port:
                # A genuine scheme without a host (mailto:, javascript:, ...).
                return ""
            # Re-parse as a network-path reference so that host, port and
            # path are split by the URL parser instead of by hand.
            scheme = "https"
            netloc = urlparse("//" + candidate).netloc
    except ValueError:
        # urlparse rejects some malformed input (e.g. unbalanced IPv6 brackets).
        return ""
    if not netloc:
        return ""
    return f"{scheme}://{netloc}/favicon.ico"
def _cse_request(q, num=DEFAULT_NUM, hl=DEFAULT_LOCALE):
    """Send one query to the Custom Search API and return the decoded JSON.

    Credentials are read from the process environment:
      - GOOGLE_DEVELOPER_KEY (the API key)
      - GOOGLE_CSE_ID (the search engine ID)

    Args:
        q: Search query, sent as the "q" parameter.
        num: Requested result count; clamped to the range 1..10 before sending.
        hl: Locale code, sent as the "hl" parameter.

    Returns:
        The JSON body of a 200 response, as decoded by ``resp.json()``.

    Raises:
        RuntimeError: If either environment variable is missing or blank, or
            if the API answers with a status other than 200.

    Exceptions raised by ``requests.get`` itself are not caught here.
    """
    developer_key = os.environ.get("GOOGLE_DEVELOPER_KEY", "").strip()
    engine_id = os.environ.get("GOOGLE_CSE_ID", "").strip()
    if not developer_key:
        raise RuntimeError("GOOGLE_DEVELOPER_KEY není nastaven v .env")
    if not engine_id:
        raise RuntimeError("GOOGLE_CSE_ID (Programmable Search Engine ID) není nastaven v .env")

    # Respect the minimum spacing between requests before going out on the wire.
    _throttle_api()

    # The API accepts at most 10 results per call.
    page_size = min(max(num, 1), 10)
    query_params = dict(
        key=developer_key,
        cx=engine_id,
        q=q,
        num=page_size,
        hl=hl,
        safe="off",
    )
    resp = requests.get(API_ENDPOINT, params=query_params, timeout=15)

    if resp.status_code == 200:
        return resp.json()

    # Failure: prefer the API's own error message, fall back to the raw body.
    try:
        error_body = resp.json()
        msg = error_body.get("error", {}).get("message") or resp.text
    except Exception:
        msg = resp.text
    raise RuntimeError(f"Google CSE API chyba ({resp.status_code}): {msg}")
def get_google_first_page(query):
    """Fetch one page of Custom Search results for *query*.

    Returns:
        A list with one dict per item in the API response, in response order.
        Each dict has the keys "position" (1-based rank), "title", "link",
        "snippet" and "icon" (favicon URL derived from the link). An absent
        or empty "items" field yields an empty list.
    """
    payload = _cse_request(query, num=DEFAULT_NUM, hl=DEFAULT_LOCALE)

    def _as_result(rank, entry):
        # Each field prefers its plain key, then its alternate key, then "".
        url = entry.get("link") or entry.get("formattedUrl") or ""
        return {
            "position": rank,
            "title": entry.get("title") or entry.get("htmlTitle") or "",
            "link": url,
            "snippet": entry.get("snippet") or entry.get("htmlSnippet") or "",
            "icon": favicon_from_link(url),
        }

    return [
        _as_result(rank, entry)
        for rank, entry in enumerate(payload.get("items") or [], start=1)
    ]