push
This commit is contained in:
2
.env (example)
Normal file
2
.env (example)
Normal file
@@ -0,0 +1,2 @@
|
||||
GOOGLE_DEVELOPER_KEY=key
|
||||
GOOGLE_CSE_ID=key
|
||||
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
# Local virtual environment and secrets
venv
.env
# Interpreter bytecode caches must not be committed
__pycache__/
*.pyc
|
||||
6
Dockerfile
Normal file
6
Dockerfile
Normal file
@@ -0,0 +1,6 @@
|
||||
FROM python:3.11-slim
WORKDIR /app
# Install dependencies before copying the sources so this layer stays cached
# until requirements.txt itself changes.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
ENV FLASK_APP=app.py
CMD ["python", "app.py"]
|
||||
BIN
__pycache__/scraper.cpython-311.pyc
Normal file
BIN
__pycache__/scraper.cpython-311.pyc
Normal file
Binary file not shown.
BIN
__pycache__/scraper.cpython-313.pyc
Normal file
BIN
__pycache__/scraper.cpython-313.pyc
Normal file
Binary file not shown.
47
app.py
Normal file
47
app.py
Normal file
@@ -0,0 +1,47 @@
|
||||
from flask import Flask, render_template, request, send_file, jsonify, make_response
|
||||
from scraper import get_google_first_page
|
||||
import io, json, csv, yaml, os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
@app.route("/", methods=["GET"])
def index():
    """Serve the search form with an empty result list."""
    no_results = []
    return render_template("index.html", results=no_results)
|
||||
|
||||
@app.route("/search", methods=["POST"])
def search():
    """Run the submitted query and render the result page.

    Reads the ``q`` form field. A blank query, or any exception raised by
    ``get_google_first_page``, re-renders the search form with an error
    message. An empty result list is still rendered, with a notice attached.
    """
    term = request.form.get("q", "").strip()
    if not term:
        return render_template("index.html", error="Zadejte dotaz.")

    try:
        hits = get_google_first_page(term)  # list of dicts
    except Exception as exc:
        return render_template("index.html", error=f"Vyhledávání selhalo: {exc}")

    # Shown only when the lookup produced nothing to display.
    empty_notice = (
        "Nebyly nalezeny žádné výsledky/dosáhli jste limitu (5 sekund mezi požadavky)."
    )
    return render_template(
        "results.html",
        query=term,
        results=hits,
        notice=None if hits else empty_notice,
    )
|
||||
|
||||
@app.route("/export", methods=["POST"])
def export():
    """Return the posted results as a downloadable file.

    Expects a JSON object body with two optional keys:
      - ``format``: export format; only ``"json"`` is supported (default).
      - ``results``: the data to serialise (default: empty list).

    Returns:
        The file as an attachment, or a JSON ``{"error": ...}`` body with
        status 400 when the payload is not a JSON object or the format is
        unsupported.
    """
    # silent=True yields None instead of an HTML error page for a malformed
    # or non-JSON body, so the client always receives a JSON error it can parse.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        # Covers None, lists, strings, numbers: anything without .get().
        return jsonify({"error": "invalid JSON payload"}), 400

    ext = payload.get("format", "json")
    if ext != "json":
        return jsonify({"error": "unsupported format"}), 400

    results = payload.get("results", [])
    body = json.dumps(results, ensure_ascii=False, indent=2).encode("utf-8")
    return send_file(
        io.BytesIO(body),
        as_attachment=True,
        download_name=f"results.{ext}",
        mimetype="application/json",
    )
|
||||
|
||||
if __name__ == "__main__":
    # Listening port; overridable through the PORT environment variable.
    port = int(os.environ.get("PORT", 5000))
    # The server binds to every interface, so the interactive debugger must be
    # opt-in: it allows arbitrary code execution for anyone who can reach it.
    # FLASK_ENV=development is still honoured so existing dev setups keep
    # their debug/reload behaviour.
    debug_flag = os.environ.get("FLASK_DEBUG", "").strip().lower()
    debug = (
        debug_flag in {"1", "true", "yes", "on"}
        or os.environ.get("FLASK_ENV") == "development"
    )
    app.run(host="0.0.0.0", port=port, debug=debug)
|
||||
17
docker-compose.yml
Normal file
17
docker-compose.yml
Normal file
@@ -0,0 +1,17 @@
|
||||
version: "3.8"
services:
  web:
    build: .
    ports:
      - "8050:5000"
    environment:
      - FLASK_ENV=development
      # FLASK_ENV is ignored by Flask >= 2.3; FLASK_DEBUG is the supported switch.
      - FLASK_DEBUG=1
      # NOTE(review): SERPAPI_KEY is not read by any code visible in this
      # commit - confirm and remove.
      - SERPAPI_KEY=${SERPAPI_KEY:-}
      # Credentials read by scraper.py. Key-only entries pass the value through
      # when it is set and leave it unset otherwise, so load_dotenv() can still
      # fill them from the mounted .env file.
      - GOOGLE_DEVELOPER_KEY
      - GOOGLE_CSE_ID
    volumes:
      - .:/app
    healthcheck:
      # Probe the running web server instead of running a mocked unit test;
      # urlopen raises (non-zero exit) on connection errors and HTTP >= 400.
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/', timeout=5)"]
      interval: 1m
      timeout: 20s
      retries: 3
      start_period: 20s
|
||||
5
requirements.txt
Normal file
5
requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
Flask
|
||||
requests
|
||||
PyYAML
|
||||
python-dotenv
|
||||
pytest
|
||||
102
scraper.py
Normal file
102
scraper.py
Normal file
@@ -0,0 +1,102 @@
|
||||
import os
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
# --- Configurace ---
|
||||
API_ENDPOINT = "https://www.googleapis.com/customsearch/v1"
|
||||
DEFAULT_LOCALE = "cs" # language for results
|
||||
DEFAULT_NUM = 10 # how many results to fetch (max 10 per API call)
|
||||
RATE_SECONDS = 5.0 # wait time between API requests
|
||||
last_api_call = 0.0 # remember when we last called the API
|
||||
|
||||
|
||||
def _throttle_api():
    """Sleep until RATE_SECONDS have passed since the previous API call.

    A minimal rate limiter built on the module-level ``last_api_call``
    timestamp; it spaces requests out so the daily quota is not burned
    through too quickly.
    """
    global last_api_call
    elapsed = time.time() - last_api_call
    remaining = RATE_SECONDS - elapsed
    if remaining > 0:
        time.sleep(remaining)
    # Stamp after any sleep so the next caller measures from the real call time.
    last_api_call = time.time()
|
||||
|
||||
|
||||
def favicon_from_link(link):
    """Build the conventional ``/favicon.ico`` URL for the site behind *link*.

    Args:
        link: Result URL. May lack a scheme (``example.com/path``), in which
            case ``https`` is assumed.

    Returns:
        ``"<scheme>://<host>/favicon.ico"``, or ``""`` when no host can be
        determined (empty, non-string, relative or malformed input).
    """
    if not isinstance(link, str) or not link:
        return ""
    try:
        parsed = urlparse(link)
    except ValueError:
        # urlparse rejects e.g. an unbalanced IPv6 bracket.
        return ""

    scheme = parsed.scheme or "https"
    netloc = parsed.netloc
    if not netloc:
        # No "//" authority marker: the host sits at the front of the path.
        first_segment = parsed.path.split("/", 1)[0]
        if parsed.scheme and first_segment.isdigit():
            # "host:port/path" - urlparse took the host for a scheme and left
            # the port at the start of the path; stitch them back together.
            netloc = f"{parsed.scheme}:{first_segment}"
            scheme = "https"
        else:
            netloc = first_segment
    if not netloc:
        return ""
    return f"{scheme}://{netloc}/favicon.ico"
|
||||
|
||||
|
||||
def _cse_request(q, num=DEFAULT_NUM, hl=DEFAULT_LOCALE):
    """Call the Custom Search API and return the decoded JSON data.

    Requires two env variables in your .env file:
    - GOOGLE_DEVELOPER_KEY (your API key)
    - GOOGLE_CSE_ID (your search engine ID)

    Args:
        q: Search query.
        num: Number of results to request; clamped to 1..10 because the API
            allows at most 10 per call.
        hl: Value sent as the API's ``hl`` parameter.

    Returns:
        The decoded JSON response body.

    Raises:
        RuntimeError: If a required env variable is missing, the request
            cannot be completed, the API answers with a non-200 status, or
            the body is not valid JSON.
    """
    api_key = os.environ.get("GOOGLE_DEVELOPER_KEY")
    cse_id = os.environ.get("GOOGLE_CSE_ID")
    if not api_key:
        raise RuntimeError("GOOGLE_DEVELOPER_KEY není nastaven v .env")
    if not cse_id:
        raise RuntimeError("GOOGLE_CSE_ID (Programmable Search Engine ID) není nastaven v .env")

    _throttle_api()
    params = {
        "key": api_key,
        "cx": cse_id,
        "q": q,
        "num": min(max(num, 1), 10),  # API allows up to 10 per call
        "hl": hl,
        "safe": "off",
    }
    try:
        resp = requests.get(API_ENDPOINT, params=params, timeout=15)
    except requests.RequestException as exc:
        # Transport errors embed the full request URL, including the API key,
        # in their message. Report only the error type and drop the chained
        # context so the key cannot reach the rendered page or a traceback.
        raise RuntimeError(
            f"Google CSE API chyba: požadavek selhal ({type(exc).__name__})"
        ) from None

    if resp.status_code != 200:
        # Try to extract a nice error message from the API response.
        try:
            msg = resp.json().get("error", {}).get("message") or resp.text
        except (ValueError, AttributeError):
            # Body is not JSON, or not shaped like {"error": {"message": ...}}.
            msg = resp.text
        raise RuntimeError(f"Google CSE API chyba ({resp.status_code}): {msg}")

    try:
        return resp.json()
    except ValueError:
        # A 200 with a non-JSON body would otherwise surface as a raw decode error.
        raise RuntimeError(
            f"Google CSE API chyba ({resp.status_code}): neplatná JSON odpověď"
        ) from None
|
||||
|
||||
|
||||
def get_google_first_page(query):
    """Fetch Custom Search results for *query* and flatten them to dicts.

    Returns a list in which every entry carries the keys
    position, title, link, snippet and icon.
    """
    payload = _cse_request(query, num=DEFAULT_NUM, hl=DEFAULT_LOCALE)
    entries = payload.get("items", []) or []

    def _to_result(rank, entry):
        # Prefer the plain field; fall back to its alternate, then to "".
        url = entry.get("link") or entry.get("formattedUrl") or ""
        return {
            "position": rank,
            "title": entry.get("title") or entry.get("htmlTitle") or "",
            "link": url,
            "snippet": entry.get("snippet") or entry.get("htmlSnippet") or "",
            "icon": favicon_from_link(url),
        }

    return [_to_result(rank, entry) for rank, entry in enumerate(entries, start=1)]
|
||||
47
templates/index.html
Normal file
47
templates/index.html
Normal file
@@ -0,0 +1,47 @@
|
||||
<!doctype html>
|
||||
<html lang="cs">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<title>Vyhledávání Google — Inzio</title>
|
||||
<!-- Bootstrap 5 CSS -->
|
||||
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">
|
||||
</head>
|
||||
<body class="bg-light d-flex min-vh-100">
|
||||
<main class="container my-auto py-5">
|
||||
<div class="row justify-content-center">
|
||||
<div class="col-12 col-md-10 col-lg-8">
|
||||
<div class="text-center mb-4">
|
||||
<h1 class="fw-semibold">Inzio Web Search</h1>
|
||||
<p class="text-muted mb-0">Jednoduché vyhledávání na Googlu</p>
|
||||
</div>
|
||||
|
||||
{% if error %}
|
||||
<div class="alert alert-danger" role="alert">
|
||||
{{ error }}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<div class="card shadow-sm">
|
||||
<div class="card-body p-4">
|
||||
<form action="/search" method="POST" class="d-flex flex-column gap-3">
|
||||
<div>
|
||||
<label for="q" class="form-label">Hledaný dotaz</label>
|
||||
<input type="text" class="form-control form-control-lg" id="q" name="q" placeholder="Zadejte dotaz…" required>
|
||||
</div>
|
||||
<div class="d-grid d-sm-flex gap-2">
|
||||
<button type="submit" class="btn btn-primary btn-lg">Hledat</button>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
<div class="card-footer bg-white text-muted small">
|
||||
Výsledky poskytuje <a href="https://developers.google.com/custom-search/v1/overview#search_engine_id" target="_blank" rel="noopener">Google Custom Search JSON API</a> · limit: 1 požadavek / 5 s
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</main>
|
||||
|
||||
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
|
||||
</body>
|
||||
</html>
|
||||
90
templates/results.html
Normal file
90
templates/results.html
Normal file
@@ -0,0 +1,90 @@
|
||||
<!doctype html>
|
||||
<html lang="cs">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<title>Výsledky pro "{{ query }}" - vontor.cz</title>
|
||||
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">
|
||||
<script>
|
||||
// POST the results embedded in the page to /export and save the response as
// a file named "results.<fmt>".
function download(fmt) {
  const payload = {
    format: fmt,
    results: JSON.parse(document.getElementById('json-data').textContent)
  };
  fetch('/export', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify(payload)
  })
    .then(r => {
      if (r.ok) return r.blob();
      // Error responses carry a JSON body; reject with it.
      return r.json().then(j => { throw j; });
    })
    .then(blob => {
      // Trigger the download through a temporary link to an object URL.
      const url = window.URL.createObjectURL(blob);
      const a = document.createElement('a');
      a.href = url;
      a.download = 'results.' + fmt;
      document.body.appendChild(a);
      a.click();
      a.remove();
      window.URL.revokeObjectURL(url);
    })
    .catch(e => {
      // Server errors arrive as {error: "..."}; network and parse failures are
      // Error objects, which JSON.stringify would render as "{}".
      const detail = (e && (e.error || e.message)) || JSON.stringify(e);
      alert('Export failed: ' + detail);
    });
}
|
||||
</script>
|
||||
</head>
|
||||
<body class="bg-light min-vh-100 d-flex">
|
||||
<main class="container my-auto py-4">
|
||||
<div class="d-flex justify-content-between align-items-center mb-3">
|
||||
<h1 class="h3 mb-0">Výsledky pro "{{ query }}"</h1>
|
||||
<a class="btn btn-outline-secondary" href="/">Nové hledání</a>
|
||||
</div>
|
||||
|
||||
{% if notice %}
|
||||
<div class="alert alert-warning" role="alert">
|
||||
{{ notice }}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<div class="card shadow-sm mb-3">
|
||||
<div class="card-body">
|
||||
<div class="d-flex flex-wrap gap-2 justify-content-between align-items-center">
|
||||
<div class="text-muted">Počet výsledků: <strong>{{ results|length }}</strong></div>
|
||||
<div class="d-flex gap-2">
|
||||
<button class="btn btn-sm btn-outline-primary" onclick="download('json')">Stáhnout JSON</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% if results and results|length > 0 %}
|
||||
<ol class="list-group list-group-numbered">
|
||||
|
||||
{# One list entry per search result: favicon, linked title, snippet, raw URL. #}
{% for item in results %}
  <li class="list-group-item">
    <div class="d-flex gap-3 align-items-start">
      {% if item.icon %}
        <img src="{{ item.icon }}" alt="" width="20" height="20" class="rounded mt-1" onerror="this.style.display='none'">
      {% endif %}

      <div class="flex-grow-1">
        <a class="fw-semibold" href="{{ item.link }}" target="_blank" rel="noopener noreferrer">{{ item.title }}</a>

        {# The loop variable is "item"; "r" was undefined, so these never rendered. #}
        {% if item.snippet %}
          <small class="text-muted d-block">{{ item.snippet }}</small>
        {% endif %}

        <small class="text-break text-secondary">{{ item.link }}</small>
      </div>
    </div>
  </li>
{% endfor %}
|
||||
|
||||
</ol>
|
||||
{% else %}
|
||||
<div class="alert alert-info" role="alert">
|
||||
Žádné položky k zobrazení.
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<script id="json-data" type="application/json">{{ results|tojson }}</script>
|
||||
</main>
|
||||
|
||||
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
|
||||
</body>
|
||||
</html>
|
||||
BIN
tests/__pycache__/test_scraper.cpython-311-pytest-8.4.2.pyc
Normal file
BIN
tests/__pycache__/test_scraper.cpython-311-pytest-8.4.2.pyc
Normal file
Binary file not shown.
78
tests/test_scraper.py
Normal file
78
tests/test_scraper.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import os
|
||||
import json
|
||||
import types
|
||||
import pytest
|
||||
|
||||
import scraper
|
||||
|
||||
|
||||
def env_setup_test(monkeypatch):
    """Helper: provide the env variables the API client requires."""
    required = {
        "GOOGLE_DEVELOPER_KEY": "test-key",
        "GOOGLE_CSE_ID": "test-cse",
    }
    for name, value in required.items():
        monkeypatch.setenv(name, value)
|
||||
|
||||
|
||||
def mock_api_response_test(monkeypatch):
    """Helper: replace ``scraper.requests`` with a stub serving a fixed payload."""
    canned = {
        "items": [
            {
                "title": "Example Domain",
                "link": "https://example.com/",
                "snippet": "This domain is for use in illustrative examples.",
            },
            {
                "title": "OpenAI",
                "link": "https://openai.com/research",
                "snippet": "Research from OpenAI.",
            },
        ]
    }

    class FakeResp:
        # Only the parts of a response object that the scraper touches.
        status_code = 200

        def __init__(self, data):
            self._data = data
            self.text = json.dumps(data)

        def json(self):
            return self._data

    def fake_get(url, params=None, timeout=15):
        return FakeResp(canned)

    stub_module = types.SimpleNamespace(get=fake_get)
    monkeypatch.setattr(scraper, "requests", stub_module)
|
||||
|
||||
|
||||
def test_get_google_first_page_maps_output(monkeypatch):
    """API items are mapped to position/title/link/snippet/icon dicts."""
    env_setup_test(monkeypatch)
    mock_api_response_test(monkeypatch)
    # Avoid waiting for the throttle in tests
    for attr in ("RATE_SECONDS", "last_api_call"):
        monkeypatch.setattr(scraper, attr, 0)

    results = scraper.get_google_first_page("example query")

    assert isinstance(results, list)
    assert len(results) == 2
    first, second = results

    expected_first = {
        "position": 1,
        "title": "Example Domain",
        "link": "https://example.com/",
        "icon": "https://example.com/favicon.ico",
    }
    for key, value in expected_first.items():
        assert first[key] == value
    assert first["snippet"].startswith("This domain is for use")

    expected_second = {
        "position": 2,
        "title": "OpenAI",
        "link": "https://openai.com/research",
        "icon": "https://openai.com/favicon.ico",
    }
    for key, value in expected_second.items():
        assert second[key] == value
|
||||
|
||||
|
||||
def test_missing_env_raises(monkeypatch):
    """Without the required env variables the client raises RuntimeError."""
    # Unset env to simulate missing configuration
    for name in ("GOOGLE_DEVELOPER_KEY", "GOOGLE_CSE_ID"):
        monkeypatch.delenv(name, raising=False)

    with pytest.raises(RuntimeError):
        scraper.get_google_first_page("anything")
|
||||
Reference in New Issue
Block a user