diff options
Diffstat (limited to 'folderz-parser.py')
-rw-r--r-- | folderz-parser.py | 80 |
1 file changed, 59 insertions, 21 deletions
# Post-patch contents of folderz-parser.py (blob c859332), reconstructed from
# the flattened diff against 610833a, with review fixes applied.
"""Scrape discount offers from folderz.nl and store them in a database.

Each listing page is downloaded, handed to a Gemini model that extracts the
offers as structured data, and the result is persisted as ``Discount`` rows.
"""

import logging
import os
import time
from datetime import datetime, timedelta, timezone
from typing import Iterable, Optional
from urllib.parse import urljoin

import requests
from google import genai
from pydantic import BaseModel
from sqlalchemy import DateTime, Float, String, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column

LOG = logging.getLogger(__name__)

# Upper bound for the page download; without it a stalled server would block
# the scraper forever.
REQUEST_TIMEOUT_SECONDS = 30

# Pause between consecutive page requests.
PAGE_DELAY_SECONDS = 1


class ModelProductResponse(BaseModel):
    """Schema the model is asked to fill in for every offer found on a page."""

    name: str
    store: str
    price: float
    # Added to the scrape time to compute ``Discount.until``.
    expires_in_days: int
    # Not read anywhere in this module.
    current_page: int
    # Link to the following listing page; resolved against the site base URL.
    next_page_url: str


class Base(DeclarativeBase):
    """Declarative base that collects the ORM table metadata."""


class Discount(Base):
    """ORM row of the ``discount`` table: one offer with its validity window."""

    __tablename__ = "discount"

    id: Mapped[int] = mapped_column(primary_key=True)
    product: Mapped[str] = mapped_column(String(255))
    store: Mapped[str] = mapped_column(String(255))
    price: Mapped[float] = mapped_column(Float)
    # Time the offer was scraped.
    since: Mapped[datetime] = mapped_column(DateTime)
    # ``since`` plus the number of days the model reported.
    until: Mapped[datetime] = mapped_column(DateTime)


def fetch_page_data(client: genai.Client, url: str) -> list[ModelProductResponse]:
    """Download ``url`` and let the model extract the offers it contains.

    Args:
        client: Gemini client used for the extraction request.
        url: Absolute URL of the listing page to download.

    Returns:
        The offers parsed by the model; an empty list when the model response
        could not be parsed into the requested schema.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        requests.Timeout: If the page is not delivered within
            ``REQUEST_TIMEOUT_SECONDS``.
    """
    page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    page.raise_for_status()

    prompt = f"Parse HTML input into JSON and lowercase all names.\n\n{page.text}"

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt,
        config={
            "response_mime_type": "application/json",
            "response_schema": list[ModelProductResponse],
        },
    )

    # ``parsed`` is optional in the SDK; normalise a missing result to an empty
    # list so callers can always iterate.
    return response.parsed or []


def _next_page_url(
    base_url: str, items: Iterable[ModelProductResponse]
) -> Optional[str]:
    """Return the absolute URL of the following page, or None if none is given."""
    link = None
    for item in items:
        # Every item carries the link; keep the last non-empty one.
        if item.next_page_url:
            link = item.next_page_url

    # urljoin copes with absolute links and with links lacking a leading slash,
    # both of which plain string concatenation would mangle.
    return urljoin(base_url, link) if link else None


def _build_discounts(
    items: Iterable[ModelProductResponse], now: datetime
) -> list[Discount]:
    """Convert parsed offers into ``Discount`` rows valid from ``now``."""
    return [
        Discount(
            product=item.name,
            store=item.store,
            price=item.price,
            since=now,
            until=now + timedelta(days=item.expires_in_days),
        )
        for item in items
    ]


def main() -> None:
    """Walk the popular-offers listing page by page and persist every offer.

    Stops when a page provides no link to a following page or when that link
    points to a page that was already processed, so the crawl always ends.
    """
    # NOTE(review): credentials are hard-coded in the connection URL; consider
    # moving them to configuration.
    engine = create_engine("mysql+pymysql://folderz:folderz@localhost:3306/folderz")
    Base.metadata.create_all(engine)

    key = os.getenv("API_KEY")
    client = genai.Client(api_key=key)

    base_url = "https://www.folderz.nl"
    url: Optional[str] = base_url + "/populaire-aanbiedingen"
    visited: set[str] = set()

    while url and url not in visited:
        visited.add(url)
        LOG.info("querying %s", url)

        # Do the slow network/model work before opening the database session.
        items = fetch_page_data(client, url)

        if items:
            # One timestamp per page keeps all rows of a page consistent.
            now = datetime.now(timezone.utc)
            with Session(engine) as session:
                session.add_all(_build_discounts(items, now))
                session.commit()

        url = _next_page_url(base_url, items)
        if url:
            time.sleep(PAGE_DELAY_SECONDS)


# NOTE(review): the diff hunk ends at the guard line; the conventional
# ``main()`` call is assumed as its body - confirm against the full file.
if __name__ == "__main__":
    main()