summary | refs | log | tree | commit | diff
path: root/folderz-parser.py
diff options
context:
space:
mode:
Diffstat (limited to 'folderz-parser.py')
-rw-r--r-- folderz-parser.py 80
1 files changed, 59 insertions, 21 deletions
diff --git a/folderz-parser.py b/folderz-parser.py
index 610833a..c859332 100644
--- a/folderz-parser.py
+++ b/folderz-parser.py
@@ -1,57 +1,95 @@
import os
import requests
+import time
+import logging
-from datetime import datetime, timezone
+from datetime import datetime, timezone, timedelta
+from sqlalchemy import DateTime, Float, String, create_engine
+from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, Session
from pydantic import BaseModel
from google import genai
+LOG = logging.getLogger(__name__)
+
+
class ModelProductResponse(BaseModel):
name: str
store: str
price: float
+ expires_in_days: int
current_page: int
next_page_url: str
-class Product(ModelProductResponse):
- date: datetime
+class Base(DeclarativeBase):
+ pass
-def main():
- key = os.getenv("API_KEY")
- client = genai.Client(api_key=key)
+class Discount(Base):
+ __tablename__ = "discount"
- base_url = "https://www.folderz.nl"
- init_url = base_url + "/populaire-aanbiedingen"
+ id: Mapped[int] = mapped_column(primary_key=True)
+ product: Mapped[str] = mapped_column(String(255))
+ store: Mapped[str] = mapped_column(String(255))
+ price: Mapped[float] = mapped_column(Float)
+ since: Mapped[datetime] = mapped_column(DateTime)
+ until: Mapped[datetime] = mapped_column(DateTime)
- curl = requests.get(init_url)
- curl.raise_for_status()
- proompt = """Given an HTML input, give me the product name(s), store(s) and price(s). Please lowercase all names.
+def fetch_page_data(client: genai.Client, url: str) -> list[ModelProductResponse]:
+ curl = requests.get(url)
+ curl.raise_for_status()
- %s
- """
+ prompt = f"Parse HTML input into JSON and lowercase all names.\n\n{curl.text}"
response = client.models.generate_content(
model="gemini-2.0-flash",
- contents=proompt % curl.text,
+ contents=prompt,
config={
"response_mime_type": "application/json",
"response_schema": list[ModelProductResponse],
},
)
- model_responses: list[ModelProductResponse] = response.parsed
- products: list[Product] = []
+ return response.parsed
+
+
+def main():
+ engine = create_engine("mysql+pymysql://folderz:folderz@localhost:3306/folderz")
+ Base.metadata.create_all(engine)
+
+ key = os.getenv("API_KEY")
+ client = genai.Client(api_key=key)
+
+ base_url = "https://www.folderz.nl"
+ url = base_url + "/populaire-aanbiedingen"
+
+ while url:
+ discounts: list[Discount] = []
+
+ with Session(engine) as session:
+ LOG.info(f"querying {url}")
+
+ for response in fetch_page_data(client, url):
+ url = base_url + response.next_page_url
+ now = datetime.now(timezone.utc)
+
+ discount = Discount(
+ product=response.name,
+ store=response.store,
+ price=response.price,
+ since=now,
+ until=now + timedelta(days=response.expires_in_days),
+ )
+
+ discounts.append(discount)
- for response in model_responses:
- print(response.current_page, response.next_page_url)
+ session.add_all(discounts)
+ session.commit()
- products.append(
- Product(**response.model_dump(), date=datetime.now(timezone.utc))
- )
+ time.sleep(1)
if __name__ == "__main__":