push
102
scraper.py
Normal file
@@ -0,0 +1,102 @@
import os
import time
from urllib.parse import urlparse

import requests


# --- Configuration ---
API_ENDPOINT = "https://www.googleapis.com/customsearch/v1"
DEFAULT_LOCALE = "cs"  # language for results
DEFAULT_NUM = 10  # how many results to fetch (max 10 per API call)
RATE_SECONDS = 5.0  # minimum wait between API requests
last_api_call = 0.0  # timestamp of the most recent API call


def _throttle_api():
    """
    Pause if the previous API call was too recent.

    A very basic rate limiter: at most one request every RATE_SECONDS
    seconds, which helps avoid burning through the daily quota too fast.
    """
    global last_api_call
    now = time.time()
    wait = RATE_SECONDS - (now - last_api_call)

    if wait > 0:
        time.sleep(wait)
    last_api_call = time.time()
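
# Illustrative timing (editor's sketch, not part of the original commit):
# with RATE_SECONDS = 5.0, two back-to-back calls behave like this:
#
#   _throttle_api()  # first call: last_api_call is 0.0, returns immediately
#   _throttle_api()  # second call: sleeps ~5 seconds, then returns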


def favicon_from_link(link):
    """Derive a best-effort favicon URL from a result link."""
    try:
        u = urlparse(link)
        scheme = u.scheme or "https"
        netloc = u.netloc
        if not netloc and u.path:
            # Handle URLs without a scheme, e.g. example.com/path
            netloc = u.path.split("/")[0]
        if not netloc:
            return ""
        return f"{scheme}://{netloc}/favicon.ico"
    except Exception:
        return ""


def _cse_request(q, num=DEFAULT_NUM, hl=DEFAULT_LOCALE):
    """Call the Custom Search API and return the parsed JSON data.

    Requires two environment variables (e.g. from your .env file):
    - GOOGLE_DEVELOPER_KEY (your API key)
    - GOOGLE_CSE_ID (your search engine ID)
    """
    api_key = os.environ.get("GOOGLE_DEVELOPER_KEY")
    cse_id = os.environ.get("GOOGLE_CSE_ID")
    if not api_key:
        raise RuntimeError("GOOGLE_DEVELOPER_KEY is not set in .env")
    if not cse_id:
        raise RuntimeError("GOOGLE_CSE_ID (Programmable Search Engine ID) is not set in .env")

    _throttle_api()
    params = {
        "key": api_key,
        "cx": cse_id,
        "q": q,
        "num": min(max(num, 1), 10),  # the API allows at most 10 results per call
        "hl": hl,
        "safe": "off",
    }
    resp = requests.get(API_ENDPOINT, params=params, timeout=15)
    if resp.status_code != 200:
        # Try to extract a readable error message from the API response
        try:
            data = resp.json()
            msg = data.get("error", {}).get("message") or resp.text
        except Exception:
            msg = resp.text
        raise RuntimeError(f"Google CSE API error ({resp.status_code}): {msg}")
    return resp.json()
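
# Note (editor's sketch): this module only reads os.environ, so if the
# credentials live in a .env file the caller must load it first, for
# example with python-dotenv (an assumption; not imported by this module):
#
#   from dotenv import load_dotenv
#   load_dotenv()  # populates os.environ before _cse_request() is used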


def get_google_first_page(query):
    """Return a list of results for a query using the Custom Search API.

    Each item has: position, title, link, snippet, icon.
    """
    data = _cse_request(query, num=DEFAULT_NUM, hl=DEFAULT_LOCALE)
    items = data.get("items", []) or []

    results = []
    for i, item in enumerate(items, start=1):
        link = item.get("link") or item.get("formattedUrl") or ""
        title = item.get("title") or item.get("htmlTitle") or ""
        snippet = item.get("snippet") or item.get("htmlSnippet") or ""
        results.append({
            "position": i,
            "title": title,
            "link": link,
            "snippet": snippet,
            "icon": favicon_from_link(link),
        })
    return results
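
# A minimal manual check (editor's sketch; assumes GOOGLE_DEVELOPER_KEY and
# GOOGLE_CSE_ID are already set, e.g. exported in the shell or loaded from
# .env as shown above; the query string is illustrative):
if __name__ == "__main__":
    for r in get_google_first_page("python requests tutorial"):
        print(f'{r["position"]:2d}. {r["title"]}')
        print(f'    {r["link"]}')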