push
This commit is contained in:
2
.env (example)
Normal file
2
.env (example)
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
GOOGLE_DEVELOPER_KEY=key
|
||||||
|
GOOGLE_CSE_ID=key
|
||||||
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# Local virtual environment and secrets
venv
.env
# Python bytecode caches (these were being committed by accident)
__pycache__/
*.pyc
|
||||||
6
Dockerfile
Normal file
6
Dockerfile
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
FROM python:3.11-slim
WORKDIR /app
# Install dependencies before copying the sources so this layer stays cached
# until requirements.txt itself changes.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
# NOTE(review): no .dockerignore is visible in this commit; without one this
# copies everything in the build context (including venv/ and .env) into the image.
COPY . /app
ENV FLASK_APP=app.py
CMD ["python", "app.py"]
|
||||||
BIN
__pycache__/scraper.cpython-311.pyc
Normal file
BIN
__pycache__/scraper.cpython-311.pyc
Normal file
Binary file not shown.
BIN
__pycache__/scraper.cpython-313.pyc
Normal file
BIN
__pycache__/scraper.cpython-313.pyc
Normal file
Binary file not shown.
47
app.py
Normal file
47
app.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
from flask import Flask, render_template, request, send_file, jsonify, make_response
|
||||||
|
from scraper import get_google_first_page
|
||||||
|
import io, json, csv, yaml, os
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
@app.route("/", methods=["GET"])
def index():
    """Render the search form page with an empty result list."""
    no_results = []
    return render_template("index.html", results=no_results)
|
||||||
|
|
||||||
|
@app.route("/search", methods=["POST"])
def search():
    """Handle the search form submission.

    Reads the ``q`` form field, runs the query through
    ``get_google_first_page`` and renders ``results.html``. An empty query or
    any exception from the lookup re-renders ``index.html`` with an error
    message instead.
    """
    query = request.form.get("q", "").strip()
    if not query:
        return render_template("index.html", error="Zadejte dotaz.")

    try:
        # Expected to be a list of dicts, one per search hit.
        results = get_google_first_page(query)
    except Exception as e:
        failure = f"Vyhledávání selhalo: {e}"
        return render_template("index.html", error=failure)

    # Only show the notice when the lookup succeeded but produced nothing.
    notice = (
        None
        if results
        else "Nebyly nalezeny žádné výsledky/dosáhli jste limitu (5 sekund mezi požadavky)."
    )
    return render_template("results.html", query=query, results=results, notice=notice)
|
||||||
|
|
||||||
|
@app.route("/export", methods=["POST"])
def export():
    """Return the posted results as a downloadable file.

    Expects a JSON object body with:
      - ``format``: export format, defaults to ``"json"`` (the only one supported)
      - ``results``: the data to serialize, defaults to an empty list

    Returns:
        The serialized results as an attachment named ``results.json``, or a
        JSON error object with status 400 when the body is not a JSON object
        or the requested format is unsupported.
    """
    # silent=True yields None instead of aborting, so a missing or malformed
    # body gets the same JSON error shape the page's fetch() handler parses.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({"error": "invalid JSON body"}), 400

    ext = payload.get("format", "json")
    if ext != "json":
        return jsonify({"error": "unsupported format"}), 400

    results = payload.get("results", [])
    body = json.dumps(results, ensure_ascii=False, indent=2).encode("utf-8")
    # The filename is fixed here rather than interpolated from client input.
    return send_file(
        io.BytesIO(body),
        as_attachment=True,
        download_name="results.json",
        mimetype="application/json",
    )
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 5000))
    # The server binds to all interfaces, so the interactive debugger must not
    # be on unconditionally: it lets anyone who can reach it run code.
    # Debug mode is enabled only when explicitly requested through
    # FLASK_DEBUG, or through FLASK_ENV=development (as docker-compose.yml sets).
    debug_flag = os.environ.get("FLASK_DEBUG", "").strip().lower()
    debug = (
        debug_flag in {"1", "true", "yes", "on"}
        or os.environ.get("FLASK_ENV") == "development"
    )
    app.run(host="0.0.0.0", port=port, debug=debug)
|
||||||
17
docker-compose.yml
Normal file
17
docker-compose.yml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
version: "3.8"
services:
  # Single service running the Flask app built from the local Dockerfile.
  web:
    build: .
    ports:
      # Host port 8050 -> container port 5000 (app.py's default PORT).
      - "8050:5000"
    environment:
      - FLASK_ENV=development
      # NOTE(review): SERPAPI_KEY is not read by any code visible in this
      # commit; scraper.py reads GOOGLE_DEVELOPER_KEY and GOOGLE_CSE_ID.
      # Confirm whether this entry is still needed.
      - SERPAPI_KEY=${SERPAPI_KEY:-}
    volumes:
      # Mounts the project directory over /app.
      - .:/app
    healthcheck:
      # NOTE(review): this runs a unit test that mocks the HTTP call, so it
      # does not probe the running web server — confirm this is intended.
      test: ["CMD-SHELL", "pytest -q tests/test_scraper.py::test_get_google_first_page_maps_output || exit 1"]
      interval: 1m
      timeout: 20s
      retries: 3
      start_period: 20s
|
||||||
5
requirements.txt
Normal file
5
requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
Flask
|
||||||
|
requests
|
||||||
|
PyYAML
|
||||||
|
python-dotenv
|
||||||
|
pytest
|
||||||
102
scraper.py
Normal file
102
scraper.py
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
import os
|
||||||
|
import time
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
# --- Configuration ---
# Endpoint every search request is sent to.
API_ENDPOINT = "https://www.googleapis.com/customsearch/v1"
DEFAULT_LOCALE = "cs"  # language for results; sent as the `hl` request parameter
DEFAULT_NUM = 10  # how many results to fetch (max 10 per API call)
RATE_SECONDS = 5.0  # wait time between API requests
last_api_call = 0.0  # time.time() of the last API call; updated by _throttle_api()
|
||||||
|
|
||||||
|
|
||||||
|
def _throttle_api():
    """Block until at least RATE_SECONDS have passed since the last call.

    Minimal rate limiter: compares the current wall-clock time with the
    module-level ``last_api_call`` timestamp, sleeps for the remainder of the
    interval if needed, and then records the new timestamp.
    """
    global last_api_call
    elapsed = time.time() - last_api_call
    remaining = RATE_SECONDS - elapsed
    if remaining > 0:
        time.sleep(remaining)
    # Stamp after any sleep so the next interval starts from now.
    last_api_call = time.time()
|
||||||
|
|
||||||
|
|
||||||
|
def favicon_from_link(link):
    """Build the ``/favicon.ico`` URL for the site that *link* points to.

    The scheme defaults to ``https`` when the link has none. For scheme-less
    links such as ``example.com/path`` the first path segment is used as the
    host. Returns an empty string when no host can be determined or the link
    cannot be parsed.
    """
    try:
        parts = urlparse(link)
        host = parts.netloc
        if not host and parts.path:
            # No network location parsed: treat the leading path segment as the host.
            host = parts.path.split("/")[0]
        if host:
            return f"{parts.scheme or 'https'}://{host}/favicon.ico"
        return ""
    except Exception:
        # Best effort only: an unparsable link simply gets no icon.
        return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _cse_request(q, num=DEFAULT_NUM, hl=DEFAULT_LOCALE):
    """Call the Custom Search API and return the decoded JSON data.

    Requires two environment variables (e.g. from your .env file):
      - GOOGLE_DEVELOPER_KEY (your API key)
      - GOOGLE_CSE_ID (your search engine ID)

    Args:
        q: Search query string.
        num: Number of results to request; clamped to the range 1..10.
        hl: Value sent as the ``hl`` request parameter.

    Returns:
        The parsed JSON body of a 200 response.

    Raises:
        RuntimeError: if a credential is missing, the HTTP request itself
            fails, the API answers with a non-200 status, or a 200 response
            body is not valid JSON.
    """
    api_key = os.environ.get("GOOGLE_DEVELOPER_KEY")
    cse_id = os.environ.get("GOOGLE_CSE_ID")
    if not api_key:
        raise RuntimeError("GOOGLE_DEVELOPER_KEY není nastaven v .env")
    if not cse_id:
        raise RuntimeError("GOOGLE_CSE_ID (Programmable Search Engine ID) není nastaven v .env")

    _throttle_api()
    params = {
        "key": api_key,
        "cx": cse_id,
        "q": q,
        "num": min(max(num, 1), 10),  # API allows up to 10 per call
        "hl": hl,
        "safe": "off",
    }
    try:
        resp = requests.get(API_ENDPOINT, params=params, timeout=15)
    except requests.RequestException as exc:
        # Transport errors can include the request URL in their message, and
        # the URL carries the API key as a query parameter. Callers show the
        # exception text to users, so only the error class name is reported.
        raise RuntimeError(
            f"Google CSE API chyba: požadavek selhal ({type(exc).__name__})"
        ) from None

    if resp.status_code != 200:
        # Try to extract a nice error message from the API response.
        try:
            data = resp.json()
            msg = data.get("error", {}).get("message") or resp.text
        except Exception:
            msg = resp.text
        raise RuntimeError(f"Google CSE API chyba ({resp.status_code}): {msg}")

    try:
        return resp.json()
    except ValueError as exc:
        # A 200 with a non-JSON body would otherwise surface as a raw decode error.
        raise RuntimeError("Google CSE API chyba: odpověď není platný JSON") from exc
|
||||||
|
|
||||||
|
|
||||||
|
def get_google_first_page(query):
    """Fetch one page of Custom Search results and normalize each hit.

    Returns a list of dicts with the keys ``position`` (1-based), ``title``,
    ``link``, ``snippet`` and ``icon``. Missing fields fall back to their
    alternative API keys and finally to an empty string.
    """
    payload = _cse_request(query, num=DEFAULT_NUM, hl=DEFAULT_LOCALE)
    entries = payload.get("items", []) or []

    def _to_result(position, entry):
        # Map a single raw API item onto the flat dict shape the templates use.
        url = entry.get("link") or entry.get("formattedUrl") or ""
        return {
            "position": position,
            "title": entry.get("title") or entry.get("htmlTitle") or "",
            "link": url,
            "snippet": entry.get("snippet") or entry.get("htmlSnippet") or "",
            "icon": favicon_from_link(url),
        }

    return [_to_result(pos, entry) for pos, entry in enumerate(entries, start=1)]
|
||||||
47
templates/index.html
Normal file
47
templates/index.html
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="cs">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Vyhledávání Google — Inzio</title>
|
||||||
|
<!-- Bootstrap 5 CSS -->
|
||||||
|
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">
|
||||||
|
</head>
|
||||||
|
<body class="bg-light d-flex min-vh-100">
|
||||||
|
<main class="container my-auto py-5">
|
||||||
|
<div class="row justify-content-center">
|
||||||
|
<div class="col-12 col-md-10 col-lg-8">
|
||||||
|
<div class="text-center mb-4">
|
||||||
|
<h1 class="fw-semibold">Inzio Web Search</h1>
|
||||||
|
<p class="text-muted mb-0">Jednoduché vyhledávání na Googlu</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% if error %}
|
||||||
|
<div class="alert alert-danger" role="alert">
|
||||||
|
{{ error }}
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
<div class="card shadow-sm">
|
||||||
|
<div class="card-body p-4">
|
||||||
|
<form action="/search" method="POST" class="d-flex flex-column gap-3">
|
||||||
|
<div>
|
||||||
|
<label for="q" class="form-label">Hledaný dotaz</label>
|
||||||
|
<input type="text" class="form-control form-control-lg" id="q" name="q" placeholder="Zadejte dotaz…" required>
|
||||||
|
</div>
|
||||||
|
<div class="d-grid d-sm-flex gap-2">
|
||||||
|
<button type="submit" class="btn btn-primary btn-lg">Hledat</button>
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
<div class="card-footer bg-white text-muted small">
|
||||||
|
Výsledky poskytuje <a href="https://developers.google.com/custom-search/v1/overview#search_engine_id" target="_blank" rel="noopener">Google Custom Search JSON API</a> · limit: 1 požadavek / 5 s
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
90
templates/results.html
Normal file
90
templates/results.html
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="cs">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Výsledky pro "{{ query }}" - vontor.cz</title>
|
||||||
|
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">
|
||||||
|
<script>
|
||||||
|
// Posts the results embedded in the page to /export and saves the response
// as a file named results.<fmt>. Shows an alert when the export fails.
function download(fmt) {
  const raw = document.getElementById('json-data').textContent;
  const payload = { format: fmt, results: JSON.parse(raw) };
  fetch('/export', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify(payload)
  }).then(r => {
    if (r.ok) return r.blob();
    // Error responses carry a JSON body; surface it through the catch below.
    return r.json().then(j => { throw j; });
  }).then(blob => {
    const url = window.URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = 'results.' + fmt;
    document.body.appendChild(a);
    a.click();
    a.remove();
    window.URL.revokeObjectURL(url);
  }).catch(e => {
    // Error objects serialize to "{}" with JSON.stringify, so use their
    // message; plain error payloads from the server are still stringified.
    const detail = e instanceof Error ? e.message : JSON.stringify(e);
    alert('Export failed: ' + detail);
  });
}
|
||||||
|
</script>
|
||||||
|
</head>
|
||||||
|
<body class="bg-light min-vh-100 d-flex">
|
||||||
|
<main class="container my-auto py-4">
|
||||||
|
<div class="d-flex justify-content-between align-items-center mb-3">
|
||||||
|
<h1 class="h3 mb-0">Výsledky pro "{{ query }}"</h1>
|
||||||
|
<a class="btn btn-outline-secondary" href="/">Nové hledání</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% if notice %}
|
||||||
|
<div class="alert alert-warning" role="alert">
|
||||||
|
{{ notice }}
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
<div class="card shadow-sm mb-3">
|
||||||
|
<div class="card-body">
|
||||||
|
<div class="d-flex flex-wrap gap-2 justify-content-between align-items-center">
|
||||||
|
<div class="text-muted">Počet výsledků: <strong>{{ results|length }}</strong></div>
|
||||||
|
<div class="d-flex gap-2">
|
||||||
|
<button class="btn btn-sm btn-outline-primary" onclick="download('json')">Stáhnout JSON</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% if results and results|length > 0 %}
|
||||||
|
<ol class="list-group list-group-numbered">
|
||||||
|
|
||||||
|
{# One list entry per search hit. The loop variable is `item`; the snippet and
   link previously referenced an undefined `r` and therefore never rendered. #}
{% for item in results %}
<li class="list-group-item">
  <div class="d-flex gap-3 align-items-start">
    {% if item.icon %}
    <img src="{{ item.icon }}" alt="" width="20" height="20" class="rounded mt-1" onerror="this.style.display='none'">
    {% endif %}

    <div class="flex-grow-1">
      <a class="fw-semibold" href="{{ item.link }}" target="_blank" rel="noopener noreferrer">{{ item.title }}</a>

      {% if item.snippet %}
      <small class="text-muted d-block">{{ item.snippet }}</small>
      {% endif %}

      <small class="text-break text-secondary">{{ item.link }}</small>
    </div>
  </div>
</li>
{% endfor %}
|
||||||
|
|
||||||
|
</ol>
|
||||||
|
{% else %}
|
||||||
|
<div class="alert alert-info" role="alert">
|
||||||
|
Žádné položky k zobrazení.
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
<script id="json-data" type="application/json">{{ results|tojson }}</script>
|
||||||
|
</main>
|
||||||
|
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
BIN
tests/__pycache__/test_scraper.cpython-311-pytest-8.4.2.pyc
Normal file
BIN
tests/__pycache__/test_scraper.cpython-311-pytest-8.4.2.pyc
Normal file
Binary file not shown.
78
tests/test_scraper.py
Normal file
78
tests/test_scraper.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
import os
|
||||||
|
import json
|
||||||
|
import types
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import scraper
|
||||||
|
|
||||||
|
|
||||||
|
def env_setup_test(monkeypatch):
    """Helper: provide the two environment variables the API client requires."""
    required = {
        "GOOGLE_DEVELOPER_KEY": "test-key",
        "GOOGLE_CSE_ID": "test-cse",
    }
    for name, value in required.items():
        monkeypatch.setenv(name, value)
|
||||||
|
|
||||||
|
|
||||||
|
def mock_api_response_test(monkeypatch):
    """Helper: replace ``scraper.requests`` with a stub returning a fixed payload."""
    canned_payload = {
        "items": [
            {
                "title": "Example Domain",
                "link": "https://example.com/",
                "snippet": "This domain is for use in illustrative examples.",
            },
            {
                "title": "OpenAI",
                "link": "https://openai.com/research",
                "snippet": "Research from OpenAI.",
            },
        ]
    }

    class FakeResp:
        # Mimics the parts of a response object that scraper reads.
        status_code = 200

        def __init__(self, data):
            self._data = data
            self.text = json.dumps(data)

        def json(self):
            return self._data

    def fake_get(url, params=None, timeout=15):
        # A fresh response object per call, always wrapping the same payload.
        return FakeResp(canned_payload)

    stub_module = types.SimpleNamespace(get=fake_get)
    monkeypatch.setattr(scraper, "requests", stub_module)
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_google_first_page_maps_output(monkeypatch):
    """The mocked API items are mapped to position/title/link/snippet/icon dicts."""
    env_setup_test(monkeypatch)
    mock_api_response_test(monkeypatch)
    # Neutralize the throttle so the test does not sleep.
    for attr in ("RATE_SECONDS", "last_api_call"):
        monkeypatch.setattr(scraper, attr, 0)

    results = scraper.get_google_first_page("example query")

    assert isinstance(results, list)
    assert len(results) == 2
    first, second = results

    assert first["position"] == 1
    assert first["title"] == "Example Domain"
    assert first["link"] == "https://example.com/"
    assert first["snippet"].startswith("This domain is for use")
    assert first["icon"] == "https://example.com/favicon.ico"

    assert second["position"] == 2
    assert second["title"] == "OpenAI"
    assert second["link"] == "https://openai.com/research"
    assert second["icon"] == "https://openai.com/favicon.ico"
|
||||||
|
|
||||||
|
|
||||||
|
def test_missing_env_raises(monkeypatch):
    """Without the API credentials in the environment the lookup raises RuntimeError."""
    # Remove both variables to simulate a missing configuration.
    for name in ("GOOGLE_DEVELOPER_KEY", "GOOGLE_CSE_ID"):
        monkeypatch.delenv(name, raising=False)

    with pytest.raises(RuntimeError):
        scraper.get_google_first_page("anything")
|
||||||
Reference in New Issue
Block a user