This commit is contained in:
2025-10-20 12:50:03 +02:00
commit 561a91d023
13 changed files with 396 additions and 0 deletions

2
.env (example) Normal file
View File

@@ -0,0 +1,2 @@
GOOGLE_DEVELOPER_KEY=key
GOOGLE_CSE_ID=key

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
venv
.env

6
Dockerfile Normal file
View File

@@ -0,0 +1,6 @@
# Base image: the official Python 3.11 "slim" image.
FROM python:3.11-slim
# Every following instruction (and the final CMD) runs relative to /app.
WORKDIR /app
# Copy the whole build context into the image, including requirements.txt.
COPY . /app
# Install the dependencies; --no-cache-dir keeps pip's download cache out of the layer.
RUN pip install --no-cache-dir -r requirements.txt
# Names the Flask application module for the `flask` command-line tool.
ENV FLASK_APP=app.py
# Start the server through the `if __name__ == "__main__"` block of app.py.
CMD ["python", "app.py"]

Binary file not shown.

Binary file not shown.

47
app.py Normal file
View File

@@ -0,0 +1,47 @@
from flask import Flask, render_template, request, send_file, jsonify, make_response
from scraper import get_google_first_page
import io, json, csv, yaml, os
from dotenv import load_dotenv
# Load settings from a local .env file into the process environment before
# any request handler (and the scraper it calls) reads os.environ.
load_dotenv()
# The Flask application object; the @app.route decorators below register on it.
app = Flask(__name__)
@app.route("/", methods=["GET"])
def index():
    """Render the landing page containing the search form.

    The template receives an empty ``results`` list because nothing has
    been searched yet.
    """
    no_results = []
    return render_template("index.html", results=no_results)
@app.route("/search", methods=["POST"])
def search():
    """Run a search for the submitted form field ``q`` and render the outcome.

    A blank query or a failing lookup re-renders ``index.html`` with an
    ``error`` message; otherwise ``results.html`` is rendered with the query,
    the result list and, when that list is empty, a ``notice`` text.
    """
    query = request.form.get("q", "").strip()
    if not query:
        return render_template("index.html", error="Zadejte dotaz.")

    try:
        # get_google_first_page returns a list of dicts, one per result.
        results = get_google_first_page(query)
    except Exception as e:
        return render_template("index.html", error=f"Vyhledávání selhalo: {e}")

    # Only an empty result list carries a notice for the user.
    notice = (
        None
        if results
        else "Nebyly nalezeny žádné výsledky/dosáhli jste limitu (5 sekund mezi požadavky)."
    )
    return render_template("results.html", query=query, results=results, notice=notice)
@app.route("/export", methods=["POST"])
def export():
    """Return the posted search results as a downloadable file.

    Expects a JSON object of the form ``{"format": "json", "results": [...]}``.
    ``format`` defaults to ``"json"`` and ``results`` to an empty list.

    Returns:
        An ``application/json`` attachment named ``results.json`` on success,
        or a JSON error body with status 400 when the request body is not a
        JSON object or the requested format is not supported.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON
    # body, so the client always receives the JSON error shape it parses.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "invalid JSON body"}), 400

    # Validate the format before it is used to build a file name.
    ext = data.get("format", "json")
    if ext != "json":
        return jsonify({"error": "unsupported format"}), 400

    results = data.get("results", [])
    payload = json.dumps(results, ensure_ascii=False, indent=2).encode("utf-8")
    return send_file(
        io.BytesIO(payload),
        as_attachment=True,
        download_name="results.json",
        mimetype="application/json",
    )
if __name__ == "__main__":
    # Port to listen on; overridable through the PORT environment variable.
    port = int(os.environ.get("PORT", 5000))
    # Debug mode turns on the interactive debugger, which lets anyone who can
    # reach the server execute code. The server binds to all interfaces, so
    # debug must be an explicit opt-in rather than the default.
    debug = (
        os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes", "on")
        or os.environ.get("FLASK_ENV") == "development"
    )
    app.run(host="0.0.0.0", port=port, debug=debug)

17
docker-compose.yml Normal file
View File

@@ -0,0 +1,17 @@
version: "3.8"
services:
  web:
    # Build the image from the Dockerfile in this directory.
    build: .
    ports:
      # Host port 8050 -> container port 5000 (app.py's default PORT).
      - "8050:5000"
    environment:
      - FLASK_ENV=development
      # NOTE(review): scraper.py reads GOOGLE_DEVELOPER_KEY and GOOGLE_CSE_ID;
      # nothing in the visible code reads SERPAPI_KEY - confirm it is still needed.
      - SERPAPI_KEY=${SERPAPI_KEY:-}
    volumes:
      # Mount the project directory over /app inside the container.
      - .:/app
    healthcheck:
      # The container counts as healthy when this single pytest case passes.
      test: ["CMD-SHELL", "pytest -q tests/test_scraper.py::test_get_google_first_page_maps_output || exit 1"]
      interval: 1m
      timeout: 20s
      retries: 3
      start_period: 20s

5
requirements.txt Normal file
View File

@@ -0,0 +1,5 @@
Flask
requests
PyYAML
python-dotenv
pytest

102
scraper.py Normal file
View File

@@ -0,0 +1,102 @@
import os
import time
from urllib.parse import urlparse
import requests
# --- Configuration ---
# URL that _cse_request sends its GET request to.
API_ENDPOINT = "https://www.googleapis.com/customsearch/v1"
DEFAULT_LOCALE = "cs" # sent as the `hl` request parameter
DEFAULT_NUM = 10 # how many results to request (clamped to 1..10 per API call)
RATE_SECONDS = 5.0 # minimum gap, in seconds, between two API requests
last_api_call = 0.0 # timestamp of the most recent API call; updated by _throttle_api
def _throttle_api():
    """Block until at least ``RATE_SECONDS`` have passed since the last call.

    A very small rate limiter: one request per ``RATE_SECONDS`` window, so the
    daily API quota is not burned through too quickly. The time of the call is
    stored in the module-level ``last_api_call``.

    The monotonic clock is used instead of the wall clock so that a system
    clock adjustment can neither skip the pause nor stretch it.
    """
    global last_api_call
    elapsed = time.monotonic() - last_api_call
    # Never sleep longer than one full interval, even if ``last_api_call``
    # holds a value that is not comparable with the monotonic clock.
    wait = min(RATE_SECONDS - elapsed, RATE_SECONDS)
    if wait > 0:
        time.sleep(wait)
    last_api_call = time.monotonic()
def favicon_from_link(link):
    """Build the conventional ``/favicon.ico`` URL for the site behind *link*.

    The scheme falls back to ``https`` when the link has none. For links
    written without a scheme (``example.com/path``) the first path segment is
    taken as the host. Returns an empty string when no host can be determined
    or when parsing fails.
    """
    try:
        parts = urlparse(link)
        host = parts.netloc
        if not host and parts.path:
            # No "//" authority part: treat the leading path segment as host.
            host = parts.path.partition("/")[0]
        if host:
            return f"{parts.scheme or 'https'}://{host}/favicon.ico"
        return ""
    except Exception:
        return ""
def _cse_request(q, num=DEFAULT_NUM, hl=DEFAULT_LOCALE):
    """Call the Custom Search JSON API and return the decoded response body.

    Requires two environment variables (normally provided via the .env file):
    - GOOGLE_DEVELOPER_KEY (your API key)
    - GOOGLE_CSE_ID (your search engine ID)

    Args:
        q: Search query text.
        num: Requested number of results; clamped to 1..10 before sending.
        hl: Value sent as the ``hl`` request parameter.

    Returns:
        The parsed JSON body of a 200 response.

    Raises:
        RuntimeError: if a required environment variable is missing, the HTTP
            request cannot be completed, the API answers with a non-200
            status, or a 200 response does not contain valid JSON.
    """
    api_key = os.environ.get("GOOGLE_DEVELOPER_KEY")
    cse_id = os.environ.get("GOOGLE_CSE_ID")
    if not api_key:
        raise RuntimeError("GOOGLE_DEVELOPER_KEY není nastaven v .env")
    if not cse_id:
        raise RuntimeError("GOOGLE_CSE_ID (Programmable Search Engine ID) není nastaven v .env")

    _throttle_api()

    params = {
        "key": api_key,
        "cx": cse_id,
        "q": q,
        "num": min(max(num, 1), 10),  # API allows up to 10 per call
        "hl": hl,
        "safe": "off",
    }
    try:
        resp = requests.get(API_ENDPOINT, params=params, timeout=15)
    except requests.RequestException as exc:
        # Transport errors can embed the full request URL, and the query
        # string carries the API key. Report only the error type, and drop the
        # original exception from the chain so the key is not echoed anywhere.
        raise RuntimeError(
            f"Požadavek na Google CSE API selhal ({type(exc).__name__})"
        ) from None

    if resp.status_code != 200:
        # Prefer the API's own error message; fall back to the raw body.
        try:
            msg = resp.json().get("error", {}).get("message") or resp.text
        except Exception:
            msg = resp.text
        raise RuntimeError(f"Google CSE API chyba ({resp.status_code}): {msg}")

    try:
        return resp.json()
    except ValueError as exc:
        # A 200 status with a non-JSON body (e.g. an intermediary's HTML page).
        raise RuntimeError("Google CSE API vrátilo neplatnou odpověď (není JSON)") from exc
def get_google_first_page(query):
    """Fetch one page of Custom Search results for *query*.

    Returns a list of dicts with the keys ``position`` (1-based), ``title``,
    ``link``, ``snippet`` and ``icon``. A response without ``items`` yields an
    empty list.
    """
    payload = _cse_request(query, num=DEFAULT_NUM, hl=DEFAULT_LOCALE)

    def to_result(position, entry):
        # Each field falls back to its alternate key, then to "".
        url = entry.get("link") or entry.get("formattedUrl") or ""
        return {
            "position": position,
            "title": entry.get("title") or entry.get("htmlTitle") or "",
            "link": url,
            "snippet": entry.get("snippet") or entry.get("htmlSnippet") or "",
            "icon": favicon_from_link(url),
        }

    entries = payload.get("items", []) or []
    return [to_result(pos, entry) for pos, entry in enumerate(entries, start=1)]

47
templates/index.html Normal file
View File

@@ -0,0 +1,47 @@
<!doctype html>
<html lang="cs">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Vyhledávání Google — Inzio</title>
<!-- Bootstrap 5 CSS -->
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">
</head>
<body class="bg-light d-flex min-vh-100">
<main class="container my-auto py-5">
<div class="row justify-content-center">
<div class="col-12 col-md-10 col-lg-8">
<div class="text-center mb-4">
<h1 class="fw-semibold">Inzio Web Search</h1>
<p class="text-muted mb-0">Jednoduché vyhledávání na Googlu</p>
</div>
{% if error %}
<div class="alert alert-danger" role="alert">
{{ error }}
</div>
{% endif %}
<div class="card shadow-sm">
<div class="card-body p-4">
<form action="/search" method="POST" class="d-flex flex-column gap-3">
<div>
<label for="q" class="form-label">Hledaný dotaz</label>
<input type="text" class="form-control form-control-lg" id="q" name="q" placeholder="Zadejte dotaz…" required>
</div>
<div class="d-grid d-sm-flex gap-2">
<button type="submit" class="btn btn-primary btn-lg">Hledat</button>
</div>
</form>
</div>
<div class="card-footer bg-white text-muted small">
Výsledky poskytuje <a href="https://developers.google.com/custom-search/v1/overview#search_engine_id" target="_blank" rel="noopener">Google Custom Search JSON API</a> · limit: 1 požadavek / 5&nbsp;s
</div>
</div>
</div>
</div>
</main>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
</body>
</html>

90
templates/results.html Normal file
View File

@@ -0,0 +1,90 @@
<!doctype html>
<html lang="cs">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Výsledky pro "{{ query }}" - vontor.cz</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">
<script>
/**
 * Send the results embedded in the page (#json-data) to /export and save the
 * returned file as "results.<fmt>".
 *
 * On failure an alert shows a readable reason: the server's `error` field
 * when the response is JSON, otherwise the HTTP status or the error message.
 */
function download(fmt) {
  const results = JSON.parse(document.getElementById('json-data').textContent);
  fetch('/export', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({ format: fmt, results: results })
  }).then(r => {
    if (r.ok) return r.blob();
    // Error responses are expected to be JSON ({error: "..."}); if the body
    // is not JSON, fall back to the HTTP status instead of a parse error.
    return r.json().then(
      j => { throw new Error((j && j.error) || ('HTTP ' + r.status)); },
      () => { throw new Error('HTTP ' + r.status); }
    );
  }).then(blob => {
    // Trigger the download through a temporary link to an object URL.
    const url = window.URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = 'results.' + fmt;
    document.body.appendChild(a);
    a.click();
    a.remove();
    window.URL.revokeObjectURL(url);
  }).catch(e => {
    // JSON.stringify on an Error gives "{}", so show its message instead.
    alert('Export failed: ' + (e && e.message ? e.message : String(e)));
  });
}
</script>
</head>
<body class="bg-light min-vh-100 d-flex">
<main class="container my-auto py-4">
<div class="d-flex justify-content-between align-items-center mb-3">
<h1 class="h3 mb-0">Výsledky pro "{{ query }}"</h1>
<a class="btn btn-outline-secondary" href="/">Nové hledání</a>
</div>
{% if notice %}
<div class="alert alert-warning" role="alert">
{{ notice }}
</div>
{% endif %}
<div class="card shadow-sm mb-3">
<div class="card-body">
<div class="d-flex flex-wrap gap-2 justify-content-between align-items-center">
<div class="text-muted">Počet výsledků: <strong>{{ results|length }}</strong></div>
<div class="d-flex gap-2">
<button class="btn btn-sm btn-outline-primary" onclick="download('json')">Stáhnout JSON</button>
</div>
</div>
</div>
</div>
{% if results and results|length > 0 %}
<ol class="list-group list-group-numbered">
{# One list entry per search result; `item` has title, link, snippet and icon. #}
{% for item in results %}
<li class="list-group-item">
  <div class="d-flex gap-3 align-items-start">
    {% if item.icon %}
    <img src="{{ item.icon }}" alt="" width="20" height="20" class="rounded mt-1" onerror="this.style.display='none'">
    {% endif %}
    <div class="flex-grow-1">
      <a class="fw-semibold" href="{{ item.link }}" target="_blank" rel="noopener noreferrer">{{ item.title }}</a>
      {% if item.snippet %}
      <small class="text-muted d-block">{{ item.snippet }}</small>
      {% endif %}
      <small class="text-break text-secondary">{{ item.link }}</small>
    </div>
  </div>
</li>
{% endfor %}
</ol>
{% else %}
<div class="alert alert-info" role="alert">
Žádné položky k zobrazení.
</div>
{% endif %}
<script id="json-data" type="application/json">{{ results|tojson }}</script>
</main>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
</body>
</html>

78
tests/test_scraper.py Normal file
View File

@@ -0,0 +1,78 @@
import os
import json
import types
import pytest
import scraper
def env_setup_test(monkeypatch):
    """Provide the environment variables the API client requires (test helper)."""
    required = {
        "GOOGLE_DEVELOPER_KEY": "test-key",
        "GOOGLE_CSE_ID": "test-cse",
    }
    for name, value in required.items():
        monkeypatch.setenv(name, value)
def mock_api_response_test(monkeypatch):
    """Swap ``scraper.requests`` for a stub whose ``get`` returns a fixed payload (test helper)."""
    payload = {
        "items": [
            {
                "title": "Example Domain",
                "link": "https://example.com/",
                "snippet": "This domain is for use in illustrative examples.",
            },
            {
                "title": "OpenAI",
                "link": "https://openai.com/research",
                "snippet": "Research from OpenAI.",
            },
        ]
    }

    class FakeResp:
        """Stand-in response exposing ``status_code``, ``text`` and ``json()``."""

        status_code = 200

        def __init__(self, body):
            self._data = body
            self.text = json.dumps(body)

        def json(self):
            return self._data

    def fake_get(url, params=None, timeout=15):
        # Every call gets a fresh response wrapping the same canned payload.
        return FakeResp(payload)

    stub = types.SimpleNamespace(get=fake_get)
    monkeypatch.setattr(scraper, "requests", stub)
def test_get_google_first_page_maps_output(monkeypatch):
    """The stubbed API payload is mapped to position/title/link/snippet/icon dicts."""
    env_setup_test(monkeypatch)
    mock_api_response_test(monkeypatch)
    # Neutralise the rate limiter so the test never sleeps.
    for attr in ("RATE_SECONDS", "last_api_call"):
        monkeypatch.setattr(scraper, attr, 0)

    results = scraper.get_google_first_page("example query")

    assert isinstance(results, list)
    assert len(results) == 2

    top, runner_up = results
    assert top["position"] == 1
    assert top["title"] == "Example Domain"
    assert top["link"] == "https://example.com/"
    assert top["snippet"].startswith("This domain is for use")
    assert top["icon"] == "https://example.com/favicon.ico"

    assert runner_up["position"] == 2
    assert runner_up["title"] == "OpenAI"
    assert runner_up["link"] == "https://openai.com/research"
    assert runner_up["icon"] == "https://openai.com/favicon.ico"
def test_missing_env_raises(monkeypatch):
    """Without the API configuration in the environment the lookup raises RuntimeError."""
    # Remove both settings; raising=False tolerates them being absent already.
    for name in ("GOOGLE_DEVELOPER_KEY", "GOOGLE_CSE_ID"):
        monkeypatch.delenv(name, raising=False)

    with pytest.raises(RuntimeError):
        scraper.get_google_first_page("anything")