# folderz-parser.py
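"""Scrape discount offers from folderz.nl.

Fetch each listing page over HTTP, let the Gemini API parse the raw HTML into
structured JSON, and persist the resulting offers in a MySQL database.
The Gemini API key is read from the API_KEY environment variable.
"""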
import logging
import os
import time
from datetime import datetime, timedelta, timezone

import requests
from google import genai
from pydantic import BaseModel
from sqlalchemy import DateTime, Float, String, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


LOG = logging.getLogger(__name__)


# Structured-output schema handed to Gemini: one entry per offer on the page,
# plus pagination hints (current_page, next_page_url).
class ModelProductResponse(BaseModel):
    name: str
    store: str
    price: float
    expires_in_days: int
    current_page: int
    next_page_url: str


class Base(DeclarativeBase):
    pass


# ORM model for the `discount` table: one row per scraped offer, with the
# validity window (since/until) recorded at scrape time.
class Discount(Base):
    __tablename__ = "discount"

    id: Mapped[int] = mapped_column(primary_key=True)
    product: Mapped[str] = mapped_column(String(255))
    store: Mapped[str] = mapped_column(String(255))
    price: Mapped[float] = mapped_column(Float)
    since: Mapped[datetime] = mapped_column(DateTime)
    until: Mapped[datetime] = mapped_column(DateTime)


def fetch_page_data(client: genai.Client, url: str) -> list[ModelProductResponse]:
    # Fetch the raw listing page; fail fast on HTTP errors.
    page = requests.get(url, timeout=30)
    page.raise_for_status()

    prompt = f"Parse HTML input into JSON and lowercase all names.\n\n{page.text}"

    # Ask Gemini for structured output that validates against the schema.
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt,
        config={
            "response_mime_type": "application/json",
            "response_schema": list[ModelProductResponse],
        },
    )

    # response.parsed is None when the model output does not validate against
    # the schema; treat that as an empty page instead of crashing the caller.
    return response.parsed or []


def main():
    logging.basicConfig(level=logging.INFO)

    engine = create_engine("mysql+pymysql://folderz:folderz@localhost:3306/folderz")
    Base.metadata.create_all(engine)

    key = os.getenv("API_KEY")
    client = genai.Client(api_key=key)

    base_url = "https://www.folderz.nl"
    url = base_url + "/populaire-aanbiedingen"

    while url:
        discounts: list[Discount] = []
        next_url = None

        with Session(engine) as session:
            LOG.info("querying %s", url)

            for response in fetch_page_data(client, url):
                # Every offer on a page carries the same pagination hint; an
                # empty next_page_url means this was the last page.
                if response.next_page_url:
                    next_url = base_url + response.next_page_url

                now = datetime.now(timezone.utc)

                discount = Discount(
                    product=response.name,
                    store=response.store,
                    price=response.price,
                    since=now,
                    until=now + timedelta(days=response.expires_in_days),
                )

                discounts.append(discount)

            session.add_all(discounts)
            session.commit()

        # Stop once there is no next page; otherwise be polite and pause
        # before fetching it.
        url = next_url
        time.sleep(1)


if __name__ == "__main__":
    main()