From 561a91d023ce09c42e6e67f9bb05d73d2276ad02 Mon Sep 17 00:00:00 2001 From: Brunobrno Date: Mon, 20 Oct 2025 12:50:03 +0200 Subject: [PATCH] push --- .env (example) | 2 + .gitignore | 2 + Dockerfile | 6 ++ __pycache__/scraper.cpython-311.pyc | Bin 0 -> 4605 bytes __pycache__/scraper.cpython-313.pyc | Bin 0 -> 4406 bytes app.py | 47 ++++++++ docker-compose.yml | 17 +++ requirements.txt | 5 + scraper.py | 102 ++++++++++++++++++ templates/index.html | 47 ++++++++ templates/results.html | 90 ++++++++++++++++ .../test_scraper.cpython-311-pytest-8.4.2.pyc | Bin 0 -> 10114 bytes tests/test_scraper.py | 78 ++++++++++++++ 13 files changed, 396 insertions(+) create mode 100644 .env (example) create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 __pycache__/scraper.cpython-311.pyc create mode 100644 __pycache__/scraper.cpython-313.pyc create mode 100644 app.py create mode 100644 docker-compose.yml create mode 100644 requirements.txt create mode 100644 scraper.py create mode 100644 templates/index.html create mode 100644 templates/results.html create mode 100644 tests/__pycache__/test_scraper.cpython-311-pytest-8.4.2.pyc create mode 100644 tests/test_scraper.py diff --git a/.env (example) b/.env (example) new file mode 100644 index 0000000..747be9c --- /dev/null +++ b/.env (example) @@ -0,0 +1,2 @@ +GOOGLE_DEVELOPER_KEY=key +GOOGLE_CSE_ID=key \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2d19ec7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +venv +.env \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..62259f1 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,6 @@ +FROM python:3.11-slim +WORKDIR /app +COPY . 
/app +RUN pip install --no-cache-dir -r requirements.txt +ENV FLASK_APP=app.py +CMD ["python", "app.py"] diff --git a/__pycache__/scraper.cpython-311.pyc b/__pycache__/scraper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa7ecf6db3a8f50dff32efa729ac2376b5de0dd4 GIT binary patch literal 4605 zcmb7IU2Gf25#A$@zsEmOl*}ZKvx4l{j1>u~)ijpeRI*6fY8=v#WG5EfuqWP0Je$0u z_l}Y*kSH7Efi?u#F%(EZ5`YL0M~zc7El|J@MccGNk*9(T2$VQLfPo?peq-UbF!I#d z4sU6Ey3_^j4<_kiJMoWX5;X&^Qp90xJ8q;W|^iuovHoV^H0(05uA%PC5bT*^pnAG50(`TldKGi z5GA{u#8;Ip!2&O(uHq|MMdJ-!mO#VCP(pne4Y4|_Noi4c%}&lo;&YLySTsDRyQw8{ zSlc+YM7O9ZQOtx)x>;6Mb#|4PG?fm=_*?G?pUI4<3BrRl!Gc34p+tMxphB0~2%^8k%NBBUEMyLKE;9{{OYPAa2$Q@C zTG}C%H|NaiVQ6Uw3c-)pN{@vbOxG-esdE;={8Xz884^tbJBM#mU{eno;{7zPru5~{9l zRTD`OmeVSWT1rXiY=+m8uzs18r0I^xT0+cdk|NV(Pf7A+oyn2_j3a$ORB9NFY2v1$ zdYaDK~kBGQh2F0x6)niL>~3U4fhP*PC(e z!NP8*DjqrcYMck>4|f3WO*34Yt> z8&68@FFoAl0(Hb#{EtY@+fyV(Qa3~E!-o&mASm|sq;s+#ec`cD+V7k^X#Qf50h(wp zC@?wTHbQ0t79R2rR9odyaFp6X(dwg5p(}6X&HvZvu0YAN5N=-0_Gkvd+!;!5GdZ@4 zgYD+fs6EZnnJpQVGv`>yl3c4FA&o9@vp}A-+y*=Ev~3lvo%NhaunG1Lta*#(ZH$HD|rl)=%Ir8$D#zfpXT;W8vSDfl#Lfd0Wo5$4a;) z)T!$peC|$vWvE7Nf8_1*KKKx6=rjv_h)(cZ2hRm0v#pP^W3@bwTL&MuH)nqzefXUf z`hK0+~C#&Pm-&SOy^C zS40`FV5)d_>JfWOg|kFiCVVj537eKRow))q zmrd)YWJ+gMeo4GfqaMJ|3Y#E4t?D)km`YZ={}ml>5eZR$gP_c5CDcqM3Hs#vZw*_W@RcdLgg5EshiUhbf)<=-QI+MRd>Nk z#3Rx0+1N~UUgy9oT2_rG6hYM4ODYIsHE~TNG>StGG6)ZHkdi}GNgIW7YM?ctig8IG zM`$Yxfs)Z#8a^V_z%$BgSx!V^u4;qb#@Dt3W!r%|%R2Vg zdiQM|zjd6-D~4fp{mjMQ7$2N!NN}4+#JOuA|?v)|GxAQUNcZ%fA z@)^^WjbZy!GBR+BqxBR_%U#(t!z`mbv;xAl^Ed3cO@jF|wlnXB(a(N+5j4oNc}vc+ zhkQ#{7Vh7Z)7r(!2sS{Gz2w@vxc8Rp=;EYuZF|afc5&}5m+RuBa_u>Di6)ru6hLEJ z&YH7#pm3MPg6o4WZ!u`&cR4fQgM0lKvkd^?;m80d17!&iLCvN#6)!0S^RV5gM}@2^ z$;(ehktSF~_(T%Jkt2eboufSR%I!Z5w#&q_nHO_giMd6 z77G=ilIOroI4fEpFAKeB(-#Eo=&3Mn&Neiw#_x> z3R!e}OB-?$1{wlMn2KPj8X=pJigYd)Y zQwBhQoCWq-I>g<`lTPFcZd*Pc|NX0F?qr2KS>;YbPICEfoW6eg<{R6K71u!3HBfdP zF3lDq_dGsGqP_dK&+QD9dxt8$L)G4)I&xY#gIu!}nc`^8+k0c}`r5X&;(e~_eXi`C 
zERBBh%15v4L@R^i)xmLSxJ!y`%@^3R-?DG-tM~@0zJaoDqBQ@>n;*TobGdS8ta@k+ z8tyI@Ev@u-yy8Dp^&cwxr*{Is34R*<{KZOWq8gfjhCiPy+G=+14bOE?E%;Qe@9A0~ zSlf4CH*lyVR02nr*2i@`X>kYluSl#&@rPE%s1gk~WFB8^2fV!-hAuaF_vfUIxrC(??LO^L6PIbeqc zKxzpHq-KVxAxFWuYKSWsR}Br6JAe0(w_sd#JMx_7&-6`tiTcll3>qGZ`l>~Sn7-dH~kB)5@dr2jZbX6d*v9L)#pR!bMReO>@YpkiY<9H)>!Yz4+$DMiAuGH_IQ% z0b_JP&hE^o!4$l_gCry|mk_Bw8Nxu@ zamn!_OITd(;|M3Q#3^xvmz+e9c;b?T8*a&U2oaCuhT1zNO!55=$#V!@V~9`kLZ5$* z5qN!J^NjlRz z|D5GW8~5lu=BOHkBIjx^2&Jn%)tZF|{Xh5!s&~+2wHN9{<8ANdDRjEl(9mQ(6ex)% zxw=d@n52J=Niq^M_%=dvJKC?6uS@PdpF&CIG*fNk{jz%^jki4yZaC%~BD?fQ&lR~~ zC|I4vW=6qT*)Z|&l`))_vst_#8`#u!Oq8^unLQrQL`F3*{A6qt@(RJzvZ1Cikxdw> z=2TN5gLpx~Fjil{x~9O$8wCYaD)S1tiBDrgN$c8-(c>92aYo7JseY{KCME?993XmO zE`x9C1%hW}HG31^DCnka`!EZh^>_%4T3qBT&Qx=XONU~vnwY8rGn#@5it;8fC3h%q${hV`kao-X8IipJDM)f_EMPpR6H zs&_9a)7{mKm@?8th8g$dZ(9Bob<{Mo3bjQ7pfUu1;|x?q^e}|H!Bz2=_{OQ?$gcN5 zsqg(Kw-5JhdwYw~yMlLF{_Wzf%eU&e<+1F=Ga@8e*Mw|7?0Amg{1F@ zriGncF6ey+9YK?guSry2a+)fMMl#vs?CLCYveuY^udtsO%>j~)RNIr(*d*JK?M0AI zi6&`QmynPAltjY4r?dVhIqH5&tP?sPu?4f45KxY!fT3| z)zcQ2m(2|1Mn0>WmOHkXR`RB*Yc!>@s&?IC3M2yKNE=iJ4Hkw%{8Uv-O?_&X=(!Y? 
zCWom2{0#zC5#4uqo#9)2`OEWNzU*ILW`SHZT@v3H+-_vyQB z?dw<9uC7n5O>G=6w>`aaW4ldUz5w4Vt!idzCD6LxbDm*8MGVwwI{$j1OJk=}5t88%oNAIf3A$20 zDpCVuGik#2QV#VE-pIlxmG zQ!SHp;p_r<>O5WZR^sb#Nrxmzt{z@vuaiu^kCeL zFOH92yc|nKV^?FB$FIZ^sY|h6;gdGSP@cQ4+!XiAjYzT7ShOZk#eT3ervjIaMMaMl zX{OKe*Ltg?3(rDCn#o~J(f;`dtN~5Q^NNP&F?F^uvL{|Qe}(9CM9$^t)as#$HQZ>f z=N`Z*B6RCotUfy{@)irTgM-YX#oPcAEaWUlCTnqqJgeM;wL(CRJRFSVoM8!+PW6I$ z?{hlbsSu(QOUNmPAtmGQCogyn^(m<1!1)@KxpyJmng&Qumn zLUT-XljHCpowPbZofIh1(+TovLrJMKgd&E;L2~3Rj;0G(Gv*8m5){JiQpIs!ijJsx zLC%5zkRCEZ5I^AW01CFOUR$}gs;y{SPxh4q{l$yB{^07XE3dA;w(?qOyzK8PUiiuz z-VGeuI()Gl7~OJ>e&ubko3EDx@|H{fFW2(B@w>tHN+4442X=$uxBG4zrQx@RHXP;P z@k$_63AKIW7Xp0o0-P6ok<~M|&ivlDdG$_W+k37Utq9&7;lQ>4hljFo5@@Tu+asm-x+_~$#}kvp9qi+>fDM;`Jhco@9@ zD7w=h-R_TW^~FAsK5zZpxpg>R4vcTP#w&128CV&3V`z(O|NfzXLI;6bzz!%E*Th>R zKrrqnelS`gGM}Cv=AxbGv(EPDS@yFt4AhStQz%{A-10p{5V(crptSwL~4kTnfFs}n54VV+(B z3Wlo9HAj8`_?Vo|V7N%+a7H!;ab7o6dOYjGrV0n{F3_x9@4|+r=JSf#h3RR}BRclM zf|Cw5$WfRN-2I$@Da0fPp=J5Ni=1ql%1n}E2~BLvX=}CI88eripav|VrrPq-KB;O) zbXu;uC*(W~v=x#lRl!jgM8Sp;%Nvbd7*1ZENX3&cS%G@v^7zQ`8D57!|C*tg2RR`_yzZXln!XO;vBUsaci4btn%2 zPOgABy^tFJf(kIsgM1w&=ler{eqq~t9`ME&S{+>(z5Q}&y6ii)<9lW!QH%-eZL*~T8nXFGg*Li!^{Xp z6w3$q{j56OW8eAhO91JkANy%FMypX;xoAb9#O5b^73@W{gY_Y@^;Y@9btb3J6tc<- zL;`V&ZU)r=2+1&CBKMcb`yX`dYZSciLZLH7U!|q37yyhfdMm*&G()XL|F^uK!QWyZ Z6TW{09qil;JVcl|@ZbUyVOniV{{w9ZvgZH* literal 0 HcmV?d00001 diff --git a/app.py b/app.py new file mode 100644 index 0000000..194ccfe --- /dev/null +++ b/app.py @@ -0,0 +1,47 @@ +from flask import Flask, render_template, request, send_file, jsonify, make_response +from scraper import get_google_first_page +import io, json, csv, yaml, os +from dotenv import load_dotenv + +load_dotenv() + +app = Flask(__name__) + +@app.route("/", methods=["GET"]) +def index(): + return render_template("index.html", results=[]) + +@app.route("/search", methods=["POST"]) +def search(): + query = request.form.get("q", "").strip() + if not query: + return render_template("index.html", error="Zadejte 
dotaz.") + try: + results = get_google_first_page(query) # list of dicts + except Exception as e: + return render_template("index.html", error=f"Vyhledávání selhalo: {e}") + + notice = None + if not results: + notice = ( + "Nebyly nalezeny žádné výsledky/dosáhli jste limitu (5 sekund mezi požadavky)." + ) + return render_template("results.html", query=query, results=results, notice=notice) + +@app.route("/export", methods=["POST"]) +def export(): + data = request.get_json() + ext = data.get("format", "json") + + results = data.get("results", []) + filename = f"results.{ext}" + if ext == "json": + + buf = io.BytesIO(json.dumps(results, ensure_ascii=False, indent=2).encode("utf-8")) + return send_file(buf, as_attachment=True, download_name=filename, mimetype="application/json") + else: + return jsonify({"error": "unsupported format"}), 400 + +if __name__ == "__main__": + port = int(os.environ.get("PORT", 5000)) + app.run(host="0.0.0.0", port=port, debug=True) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..48bd716 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,17 @@ +version: "3.8" +services: + web: + build: . 
+ ports: + - "8050:5000" + environment: + - FLASK_ENV=development + - SERPAPI_KEY=${SERPAPI_KEY:-} + volumes: + - .:/app + healthcheck: + test: ["CMD-SHELL", "pytest -q tests/test_scraper.py::test_get_google_first_page_maps_output || exit 1"] + interval: 1m + timeout: 20s + retries: 3 + start_period: 20s diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..52e3fc3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +Flask +requests +PyYAML +python-dotenv +pytest \ No newline at end of file diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..e7eb3ef --- /dev/null +++ b/scraper.py @@ -0,0 +1,102 @@ +import os +import time +from urllib.parse import urlparse + +import requests + + +# --- Configurace --- +API_ENDPOINT = "https://www.googleapis.com/customsearch/v1" +DEFAULT_LOCALE = "cs" # language for results +DEFAULT_NUM = 10 # how many results to fetch (max 10 per API call) +RATE_SECONDS = 5.0 # wait time between API requests +last_api_call = 0.0 # remember when we last called the API + + +def _throttle_api(): + """ + Pause if the last API call was too recent. + + This is a super basic rate limiter: we allow one request every 5 seconds. + It helps to not run through your daily quota too fast. + """ + global last_api_call + now = time.time() + wait = RATE_SECONDS - (now - last_api_call) + + if wait > 0: + time.sleep(wait) + last_api_call = time.time() + + +def favicon_from_link(link): + try: + u = urlparse(link) + scheme = u.scheme or "https" + netloc = u.netloc + if not netloc and u.path: + # Handle URLs without a scheme like: example.com/path + netloc = u.path.split("/")[0] + if not netloc: + return "" + return f"{scheme}://{netloc}/favicon.ico" + except Exception: + return "" + + +def _cse_request(q, num=DEFAULT_NUM, hl=DEFAULT_LOCALE): + """Call the Custom Search API and return the JSON data. 
+ + Requires two env variables in your .env file: + - GOOGLE_DEVELOPER_KEY (your API key) + - GOOGLE_CSE_ID (your search engine ID) + """ + api_key = os.environ.get("GOOGLE_DEVELOPER_KEY") + cse_id = os.environ.get("GOOGLE_CSE_ID") + if not api_key: + raise RuntimeError("GOOGLE_DEVELOPER_KEY není nastaven v .env") + if not cse_id: + raise RuntimeError("GOOGLE_CSE_ID (Programmable Search Engine ID) není nastaven v .env") + + _throttle_api() + params = { + "key": api_key, + "cx": cse_id, + "q": q, + "num": min(max(num, 1), 10), # API allows up to 10 per call + "hl": hl, + "safe": "off", + } + resp = requests.get(API_ENDPOINT, params=params, timeout=15) + if resp.status_code != 200: + # Try to extract a nice error message from API response + try: + data = resp.json() + msg = data.get("error", {}).get("message") or resp.text + except Exception: + msg = resp.text + raise RuntimeError(f"Google CSE API chyba ({resp.status_code}): {msg}") + return resp.json() + + +def get_google_first_page(query): + """Return a list of results for a query using the Custom Search API. + + Each item has: position, title, link, snippet, icon + """ + data = _cse_request(query, num=DEFAULT_NUM, hl=DEFAULT_LOCALE) + items = data.get("items", []) or [] + + results = [] + for i, item in enumerate(items, start=1): + link = item.get("link") or item.get("formattedUrl") or "" + title = item.get("title") or item.get("htmlTitle") or "" + snippet = item.get("snippet") or item.get("htmlSnippet") or "" + results.append({ + "position": i, + "title": title, + "link": link, + "snippet": snippet, + "icon": favicon_from_link(link), + }) + return results diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 0000000..142e0f0 --- /dev/null +++ b/templates/index.html @@ -0,0 +1,47 @@ + + + + + + Vyhledávání Google — Inzio + + + + +
+
+
+
+

Inzio Web Search

+

Jednoduché vyhledávání na Googlu

+
+ + {% if error %} + + {% endif %} + +
+
+
+
+ + +
+
+ +
+
+
+ +
+
+
+
+ + + + diff --git a/templates/results.html b/templates/results.html new file mode 100644 index 0000000..a20c342 --- /dev/null +++ b/templates/results.html @@ -0,0 +1,90 @@ + + + + + + Výsledky pro "{{ query }}" - vontor.cz + + + + +
+
+

Výsledky pro "{{ query }}"

+ Nové hledání +
+ + {% if notice %} + + {% endif %} + +
+
+
+
Počet výsledků: {{ results|length }}
+
+ +
+
+
+
+ + {% if results and results|length > 0 %} +
    + + {% for item in results %} +
  1. +
    + {% if item.icon %} + + {% endif %} + +
    + {{ item.title }} + + {% if item.snippet %} + {{ item.snippet }} + {% endif %} + + {{ item.link }} +
    +
    +
  2. + {% endfor %} + +
+ {% else %} + + {% endif %} + + +
+ + + + diff --git a/tests/__pycache__/test_scraper.cpython-311-pytest-8.4.2.pyc b/tests/__pycache__/test_scraper.cpython-311-pytest-8.4.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a000e21f6b5c83bde13c77fdf9d0c900534b0f69 GIT binary patch literal 10114 zcmd^FO>Er86()CQf2ox$+18IO*|ej`-n3dTNU*94;)5WQrCK@xUIC|xhm`tbCtRWtsQPX)<6AL;3Kb#j0RX6l_!rXW9 z^5u(fk4fjpel_;?&Ow%)Ib6*5>QgJm+fX^U5W_f@n)PF2aNvVA@RMb6U^ArR1x_y(9yeV_Vo zaX#)M1jHv?z3V^oDd1uypahlBZNCy$B1#mwX5F~DKDjf}zK82t3)7U07wgU=* ziUUuMXP{z9&Zi_z)$_TmuF?)l45f2PIjx^dT#&D+S0U@+-=VmqtIgEcdqMc<0y%GI`x3hB{}&eJ0V> z^pr^upkuM3s9_wLVM$75Q-&lJ`*z!+gsp;&(9v6q+-67D()r~#@3(i}j;;+<+K*S; zkAvXS68tm)AfAXrS5pt{0S1TuEL_=%O}_$Aj4ZtejCL$&5-;CZ`nA$TvvRM+G@;3KyYL{B5>&ED2jaUnIW8IV8wO4sCJU2`l(86krBYhMLiz zG>*;5nS5Fm&*w67D*I&LjA7*U!4oG`CQl@DnG?m6@6DuiQK1kKIGo*J8WU4lF_lgi zVE&Me)OA&~l=MX0Zw8E%kycHTPGzr|Aw8SQ=T*ZDUe2r8(MwMb+V$ihEvINT5b&z3 zC1;BJA-*XmPHDM}$W#*J51YX7q!XWzK%VyZ0bsuH;rw#bf0KA)+Uj`?11cUc{RJ&; z2J^BeXLK`Uq%tauat(c=wE@v_-VjA`anb64DHv}OjQ!$Ij!YYQ9E-uOdJNsu)V4IY zN^X9<{Bb$VqQea1?SfAJyy=IM-67&7#@1N5%NbKG@K?=Wy2S;s7Wyu*aya8QTG z8qKTWShll9mdCsMF74U(bbIRWSDF{+SW%CIx=<=BYJjI>|IeuN zc(2yWbz^%6RKVMMrO}FokkX{=DTPuzjg0UvA|vcA$(zE+nBx}+{DaW{FtgP{Z6Q*M zD9x^PRB3_vC0g=XF+xg2PkZcl#|VKK*^B7<81ZP-{1WwAwgakujO@GZV`D_wue6r@ zmR~}<@Jq;BpEqUsC5pa*Hb;~;k008fTR(Jo{BYp8^}|88A4<_#eC?nI%%q(QK_3UR zW6&E@3qncIb){>E$lhK2?{4;}t9geS1+RU|A#09l*v05+@alR~?$JYM76C@kVYkN= z@i~ngUTJapAy$gj&9Kqu)Gx8R8TQC?>j&=)d-S>WgLl4o;ra9fuEfQ=$IXVTqwkGL z6Ju{&9zQ>!;Ul^kOUt@JmkVKaFkOsUD=;D~Flh6|)3UCsnjyY;JU>4W*Q39+7;%`2 zPR8})DCvvqKc|BJxE_x#n9-D;%EH=IHmRB+Sd}TH4Os9?r*s3BDE#^PlSulI^dlK~ zh^EB(C%)m~IA1)8MV_*217>?|P>DFqY&a9wO@CU=!h*)DP>ME$t%5>NAvsfQbhY4R zgy!d8HpBUxo-$Io>_d2|f{KeRRx9j6p{;QrElN8F2u~?cq4oyW^a7G2NV<_6Me-t$ zV(ZgayNYkSY&mPU4;ZU`PiZnZHUWOr4E;vRn29b>Z=a@4hW27pwby}Qt4n zB}Z7EQ_X12b@4{F-1Q1V1+@Vz4yGEgN0Wis%6gTP&7tL?GU;6H1F|%<+P5xr z0>{n`(pj~6CP4&n6+43=oFrs1$>*mW**1Z@YzO6NF5PfsOw%Q(frc1}Ay)ew6+i&b zYH%o%0G7cS1{&tpa=?sv(6D!9botH2u}xvm%7H5Sl 
z5+VfH3eByEV;(B6jK1+(zF>S0jyIx!@ITWRM612<#T1yTkh%wx^xb%>jJXu z+#tPGn`aUV0k%SOSpL#EcM6JNNu~fiV;ssPpz4&)@NO*!%osGZb$Pf<`pcdDK$nK$ zx-KBg&JEIEwRt9?5MV1bhvhE~yHijEOELxE8RJkU0ad4ThIeZ@V8)=K9aZ6AnT(W$ zgH@pinPacfWKS8A$6l)lBUNF9N+=~G8)T$P@fFJ0$sSJ<6g{~kEQ#e1Wn%tH}fF2Z{^+P$;AtsLi5T?RiUd)j;=KWDGOa|&FjKZWa+sWCHQ=4%7{er>>Xgp#Zml{MH1yaqbbZh243MSQR%g~R9;|bN^i*x02@wHY zg=VnqCFpd7vDE4eOEEa`PGN2dV4boV-mO)qf=2e>3@}*k!aWK&ya(3>WZAi4?_O{u z6ap!>uYu*`41l{FEOeKZ$#fvgQFcoJ?3B*b8Lb1eq+{rCxeLPyt@x=xMF>DyR9S%v zhPnc^>wh1pENHiXVZ*Oq_{H_nKY=`ZW6d{y?-!|OY^@=RZ#&xkeXh1!!TtTsyVi7? zgKv7c@4z}{o!uWw-5NUGcm$h|>g~gKtQWXGPA4~fbN=_a8$8dw52Yd=H~Nfg_<7?3 z?70bPGQJ;8YgmUFmb3H53?$*3e^623P3Ag$U8YaTW)P9^@wVY=A&VcH=`%2W0N36G zT@63`xwasfZS;9DlhUzx33uqQ0<;+b{sO^&1>)VEbFWY)tzG}Fy9PZol+gUXK2Sonpz-x+6;~qbL z{4{Y*rsLL&Z^QdXU-EzZ5*bZ|zU||I#+%Hr{i>n81rW1CVjfJer+j!xNNDN}4Yn+p z5l4itrc|cAgF3#PPPb2J{gBdBB>1Z1dj0tTljusi6`;1s;z@XjZO}NUWg!>eDfKUa zfG>D{6CJ+8FS1B~o1Cynf16yxBK>W0u|@iOz%?$?-&WAi4{UJ`Y3Ssy@>`sXo@?XJ QZE+462=dTH+X*%MKQ(=-IsgCw literal 0 HcmV?d00001 diff --git a/tests/test_scraper.py b/tests/test_scraper.py new file mode 100644 index 0000000..60388f4 --- /dev/null +++ b/tests/test_scraper.py @@ -0,0 +1,78 @@ +import os +import json +import types +import pytest + +import scraper + + +def env_setup_test(monkeypatch): + """Set required env vars for the API client (helper used by tests).""" + monkeypatch.setenv("GOOGLE_DEVELOPER_KEY", "test-key") + monkeypatch.setenv("GOOGLE_CSE_ID", "test-cse") + + +def mock_api_response_test(monkeypatch): + """Mock out requests.get to return a fixed API payload (helper).""" + class FakeResp: + status_code = 200 + def __init__(self, data): + self._data = data + self.text = json.dumps(data) + def json(self): + return self._data + + data = { + "items": [ + { + "title": "Example Domain", + "link": "https://example.com/", + "snippet": "This domain is for use in illustrative examples.", + }, + { + "title": "OpenAI", + "link": "https://openai.com/research", + "snippet": "Research from 
OpenAI.", + }, + ] + } + + def fake_get(url, params=None, timeout=15): + return FakeResp(data) + + monkeypatch.setattr(scraper, "requests", types.SimpleNamespace(get=fake_get)) + + +def test_get_google_first_page_maps_output(monkeypatch): + env_setup_test(monkeypatch) + mock_api_response_test(monkeypatch) + # Avoid waiting for the throttle in tests + monkeypatch.setattr(scraper, "RATE_SECONDS", 0) + monkeypatch.setattr(scraper, "last_api_call", 0) + + results = scraper.get_google_first_page("example query") + + assert isinstance(results, list) + assert len(results) == 2 + + first = results[0] + assert first["position"] == 1 + assert first["title"] == "Example Domain" + assert first["link"] == "https://example.com/" + assert first["snippet"].startswith("This domain is for use") + assert first["icon"] == "https://example.com/favicon.ico" + + second = results[1] + assert second["position"] == 2 + assert second["title"] == "OpenAI" + assert second["link"] == "https://openai.com/research" + assert second["icon"] == "https://openai.com/favicon.ico" + + +def test_missing_env_raises(monkeypatch): + # Unset env to simulate missing configuration + monkeypatch.delenv("GOOGLE_DEVELOPER_KEY", raising=False) + monkeypatch.delenv("GOOGLE_CSE_ID", raising=False) + + with pytest.raises(RuntimeError): + scraper.get_google_first_page("anything")