diff options
Diffstat (limited to 'folderz-parser.py')
-rw-r--r-- | folderz-parser.py | 96 |
1 files changed, 0 insertions, 96 deletions
"""Scrape popular offers from folderz.nl and store them as discount rows.

Each listing page is downloaded, handed to a Gemini model that converts the
HTML into structured product records, and the records are written to the
``discount`` table. The scraper follows the ``next_page_url`` reported by the
model until no further (unvisited) page is available.
"""

import logging
import os
import time
from datetime import datetime, timedelta, timezone
from urllib.parse import urljoin

import requests
from google import genai
from pydantic import BaseModel
from sqlalchemy import DateTime, Float, String, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


LOG = logging.getLogger(__name__)

DATABASE_URL = "mysql+pymysql://folderz:folderz@localhost:3306/folderz"
BASE_URL = "https://www.folderz.nl"
START_PATH = "/populaire-aanbiedingen"

# Upper bound (seconds) for the page download so a stalled server cannot hang
# the scraper forever.
REQUEST_TIMEOUT_SECONDS = 30
# Pause (seconds) between two consecutive page requests.
PAGE_DELAY_SECONDS = 1


class ModelProductResponse(BaseModel):
    """Schema the model is asked to fill in for every product on a page."""

    name: str
    store: str
    price: float
    # Number of days, counted from the moment of scraping, the offer is valid.
    expires_in_days: int
    current_page: int
    # Link to the following listing page as reported by the model.
    next_page_url: str


class Base(DeclarativeBase):
    """Declarative base shared by all ORM models of this script."""


class Discount(Base):
    """ORM row describing one discounted product offer."""

    __tablename__ = "discount"

    id: Mapped[int] = mapped_column(primary_key=True)
    product: Mapped[str] = mapped_column(String(255))
    store: Mapped[str] = mapped_column(String(255))
    price: Mapped[float] = mapped_column(Float)
    # Time the offer was scraped and the time it is expected to expire.
    since: Mapped[datetime] = mapped_column(DateTime)
    until: Mapped[datetime] = mapped_column(DateTime)


def fetch_page_data(
    client: genai.Client,
    url: str,
    timeout: float = REQUEST_TIMEOUT_SECONDS,
) -> list[ModelProductResponse]:
    """Download *url* and let the model extract its products.

    Args:
        client: Gemini client used to run the extraction prompt.
        url: Address of the listing page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The products parsed by the model; an empty list when the model
        produced nothing that could be parsed.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        requests.Timeout: If the page does not respond within *timeout*.
    """
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()

    prompt = f"Parse HTML input into JSON and lowercase all names.\n\n{page.text}"

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt,
        config={
            "response_mime_type": "application/json",
            "response_schema": list[ModelProductResponse],
        },
    )

    # NOTE(review): `parsed` is presumably None when the model output does not
    # match the schema; normalise to a list so callers can always iterate.
    return response.parsed or []


def main():
    """Scrape every listing page and persist the offers found on it."""
    # Without a configured handler the INFO progress messages are dropped.
    logging.basicConfig(level=logging.INFO)

    engine = create_engine(DATABASE_URL)
    Base.metadata.create_all(engine)

    client = genai.Client(api_key=os.getenv("API_KEY"))

    url = urljoin(BASE_URL, START_PATH)
    # Pages already processed; guards against the model pointing back to a
    # page we have seen, which would otherwise loop forever.
    visited: set[str] = set()

    while url and url not in visited:
        visited.add(url)
        LOG.info("querying %s", url)

        products = fetch_page_data(client, url)

        # One timestamp per page keeps `since` consistent across its rows.
        now = datetime.now(timezone.utc)
        discounts = [
            Discount(
                product=product.name,
                store=product.store,
                price=product.price,
                since=now,
                until=now + timedelta(days=product.expires_in_days),
            )
            for product in products
        ]

        # Every record carries the next-page link; keep the last non-empty one.
        next_page = ""
        for product in products:
            if product.next_page_url:
                next_page = product.next_page_url

        if discounts:
            with Session(engine) as session:
                session.add_all(discounts)
                session.commit()

        # An empty page or a missing link ends the crawl. urljoin copes with
        # both relative paths and absolute URLs returned by the model.
        url = urljoin(BASE_URL, next_page) if next_page else None

        if url:
            time.sleep(PAGE_DELAY_SECONDS)


if __name__ == "__main__":
    main()