commit 561a91d023ce09c42e6e67f9bb05d73d2276ad02 Author: Brunobrno Date: Mon Oct 20 12:50:03 2025 +0200 push diff --git a/.env (example) b/.env (example) new file mode 100644 index 0000000..747be9c --- /dev/null +++ b/.env (example) @@ -0,0 +1,2 @@ +GOOGLE_DEVELOPER_KEY=key +GOOGLE_CSE_ID=key \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2d19ec7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +venv +.env \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..62259f1 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,6 @@ +FROM python:3.11-slim +WORKDIR /app +COPY . /app +RUN pip install --no-cache-dir -r requirements.txt +ENV FLASK_APP=app.py +CMD ["python", "app.py"] diff --git a/__pycache__/scraper.cpython-311.pyc b/__pycache__/scraper.cpython-311.pyc new file mode 100644 index 0000000..fa7ecf6 Binary files /dev/null and b/__pycache__/scraper.cpython-311.pyc differ diff --git a/__pycache__/scraper.cpython-313.pyc b/__pycache__/scraper.cpython-313.pyc new file mode 100644 index 0000000..afd9586 Binary files /dev/null and b/__pycache__/scraper.cpython-313.pyc differ diff --git a/app.py b/app.py new file mode 100644 index 0000000..194ccfe --- /dev/null +++ b/app.py @@ -0,0 +1,47 @@ +from flask import Flask, render_template, request, send_file, jsonify, make_response +from scraper import get_google_first_page +import io, json, csv, yaml, os +from dotenv import load_dotenv + +load_dotenv() + +app = Flask(__name__) + +@app.route("/", methods=["GET"]) +def index(): + return render_template("index.html", results=[]) + +@app.route("/search", methods=["POST"]) +def search(): + query = request.form.get("q", "").strip() + if not query: + return render_template("index.html", error="Zadejte dotaz.") + try: + results = get_google_first_page(query) # list of dicts + except Exception as e: + return render_template("index.html", error=f"Vyhledávání selhalo: {e}") + + notice = None + if not 
results: + notice = ( + "Nebyly nalezeny žádné výsledky/dosáhli jste limitu (5 sekund mezi požadavky)." + ) + return render_template("results.html", query=query, results=results, notice=notice) + +@app.route("/export", methods=["POST"]) +def export(): + data = request.get_json() + ext = data.get("format", "json") + + results = data.get("results", []) + filename = f"results.{ext}" + if ext == "json": + + buf = io.BytesIO(json.dumps(results, ensure_ascii=False, indent=2).encode("utf-8")) + return send_file(buf, as_attachment=True, download_name=filename, mimetype="application/json") + else: + return jsonify({"error": "unsupported format"}), 400 + +if __name__ == "__main__": + port = int(os.environ.get("PORT", 5000)) + app.run(host="0.0.0.0", port=port, debug=True) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..48bd716 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,17 @@ +version: "3.8" +services: + web: + build: . + ports: + - "8050:5000" + environment: + - FLASK_ENV=development + - SERPAPI_KEY=${SERPAPI_KEY:-} + volumes: + - .:/app + healthcheck: + test: ["CMD-SHELL", "pytest -q tests/test_scraper.py::test_get_google_first_page_maps_output || exit 1"] + interval: 1m + timeout: 20s + retries: 3 + start_period: 20s diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..52e3fc3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +Flask +requests +PyYAML +python-dotenv +pytest \ No newline at end of file diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..e7eb3ef --- /dev/null +++ b/scraper.py @@ -0,0 +1,102 @@ +import os +import time +from urllib.parse import urlparse + +import requests + + +# --- Configurace --- +API_ENDPOINT = "https://www.googleapis.com/customsearch/v1" +DEFAULT_LOCALE = "cs" # language for results +DEFAULT_NUM = 10 # how many results to fetch (max 10 per API call) +RATE_SECONDS = 5.0 # wait time between API requests +last_api_call = 0.0 # remember when 
we last called the API + + +def _throttle_api(): + """ + Pause if the last API call was too recent. + + This is a super basic rate limiter: we allow one request every 5 seconds. + It helps to not run through your daily quota too fast. + """ + global last_api_call + now = time.time() + wait = RATE_SECONDS - (now - last_api_call) + + if wait > 0: + time.sleep(wait) + last_api_call = time.time() + + +def favicon_from_link(link): + try: + u = urlparse(link) + scheme = u.scheme or "https" + netloc = u.netloc + if not netloc and u.path: + # Handle URLs without a scheme like: example.com/path + netloc = u.path.split("/")[0] + if not netloc: + return "" + return f"{scheme}://{netloc}/favicon.ico" + except Exception: + return "" + + +def _cse_request(q, num=DEFAULT_NUM, hl=DEFAULT_LOCALE): + """Call the Custom Search API and return the JSON data. + + Requires two env variables in your .env file: + - GOOGLE_DEVELOPER_KEY (your API key) + - GOOGLE_CSE_ID (your search engine ID) + """ + api_key = os.environ.get("GOOGLE_DEVELOPER_KEY") + cse_id = os.environ.get("GOOGLE_CSE_ID") + if not api_key: + raise RuntimeError("GOOGLE_DEVELOPER_KEY není nastaven v .env") + if not cse_id: + raise RuntimeError("GOOGLE_CSE_ID (Programmable Search Engine ID) není nastaven v .env") + + _throttle_api() + params = { + "key": api_key, + "cx": cse_id, + "q": q, + "num": min(max(num, 1), 10), # API allows up to 10 per call + "hl": hl, + "safe": "off", + } + resp = requests.get(API_ENDPOINT, params=params, timeout=15) + if resp.status_code != 200: + # Try to extract a nice error message from API response + try: + data = resp.json() + msg = data.get("error", {}).get("message") or resp.text + except Exception: + msg = resp.text + raise RuntimeError(f"Google CSE API chyba ({resp.status_code}): {msg}") + return resp.json() + + +def get_google_first_page(query): + """Return a list of results for a query using the Custom Search API. 
+ + Each item has: position, title, link, snippet, icon + """ + data = _cse_request(query, num=DEFAULT_NUM, hl=DEFAULT_LOCALE) + items = data.get("items", []) or [] + + results = [] + for i, item in enumerate(items, start=1): + link = item.get("link") or item.get("formattedUrl") or "" + title = item.get("title") or item.get("htmlTitle") or "" + snippet = item.get("snippet") or item.get("htmlSnippet") or "" + results.append({ + "position": i, + "title": title, + "link": link, + "snippet": snippet, + "icon": favicon_from_link(link), + }) + return results diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 0000000..142e0f0 --- /dev/null +++ b/templates/index.html @@ -0,0 +1,47 @@ + + + + + + Vyhledávání Google — Inzio + + + + +
+
+
+
+

Inzio Web Search

+

Jednoduché vyhledávání na Googlu

+
+ + {% if error %} + + {% endif %} + +
+
+
+
+ + +
+
+ +
+
+
+ +
+
+
+
+ + + + diff --git a/templates/results.html b/templates/results.html new file mode 100644 index 0000000..a20c342 --- /dev/null +++ b/templates/results.html @@ -0,0 +1,90 @@ + + + + + + Výsledky pro "{{ query }}" - vontor.cz + + + + +
+
+

Výsledky pro "{{ query }}"

+ Nové hledání +
+ + {% if notice %} + + {% endif %} + +
+
+
+
Počet výsledků: {{ results|length }}
+
+ +
+
+
+
+ + {% if results and results|length > 0 %} +
    + + {% for item in results %} +
  1. +
    + {% if item.icon %} + + {% endif %} + +
    + {{ item.title }} + + {# snippet and link are read from the loop variable `item` #} + {% if item.snippet %} + {{ item.snippet }} + {% endif %} + + {{ item.link }} +
    +
    +
  2. + {% endfor %} + +
+ {% else %} + + {% endif %} + + +
+ + + + diff --git a/tests/__pycache__/test_scraper.cpython-311-pytest-8.4.2.pyc b/tests/__pycache__/test_scraper.cpython-311-pytest-8.4.2.pyc new file mode 100644 index 0000000..a000e21 Binary files /dev/null and b/tests/__pycache__/test_scraper.cpython-311-pytest-8.4.2.pyc differ diff --git a/tests/test_scraper.py b/tests/test_scraper.py new file mode 100644 index 0000000..60388f4 --- /dev/null +++ b/tests/test_scraper.py @@ -0,0 +1,78 @@ +import os +import json +import types +import pytest + +import scraper + + +def env_setup_test(monkeypatch): + """Set required env vars for the API client (helper used by tests).""" + monkeypatch.setenv("GOOGLE_DEVELOPER_KEY", "test-key") + monkeypatch.setenv("GOOGLE_CSE_ID", "test-cse") + + +def mock_api_response_test(monkeypatch): + """Mock out requests.get to return a fixed API payload (helper).""" + class FakeResp: + status_code = 200 + def __init__(self, data): + self._data = data + self.text = json.dumps(data) + def json(self): + return self._data + + data = { + "items": [ + { + "title": "Example Domain", + "link": "https://example.com/", + "snippet": "This domain is for use in illustrative examples.", + }, + { + "title": "OpenAI", + "link": "https://openai.com/research", + "snippet": "Research from OpenAI.", + }, + ] + } + + def fake_get(url, params=None, timeout=15): + return FakeResp(data) + + monkeypatch.setattr(scraper, "requests", types.SimpleNamespace(get=fake_get)) + + +def test_get_google_first_page_maps_output(monkeypatch): + env_setup_test(monkeypatch) + mock_api_response_test(monkeypatch) + # Avoid waiting for the throttle in tests + monkeypatch.setattr(scraper, "RATE_SECONDS", 0) + monkeypatch.setattr(scraper, "last_api_call", 0) + + results = scraper.get_google_first_page("example query") + + assert isinstance(results, list) + assert len(results) == 2 + + first = results[0] + assert first["position"] == 1 + assert first["title"] == "Example Domain" + assert first["link"] == "https://example.com/" + assert 
first["snippet"].startswith("This domain is for use") + assert first["icon"] == "https://example.com/favicon.ico" + + second = results[1] + assert second["position"] == 2 + assert second["title"] == "OpenAI" + assert second["link"] == "https://openai.com/research" + assert second["icon"] == "https://openai.com/favicon.ico" + + +def test_missing_env_raises(monkeypatch): + # Unset env to simulate missing configuration + monkeypatch.delenv("GOOGLE_DEVELOPER_KEY", raising=False) + monkeypatch.delenv("GOOGLE_CSE_ID", raising=False) + + with pytest.raises(RuntimeError): + scraper.get_google_first_page("anything")