commit bf7cabc9d8f8f5bb9b5c6688257ba2ccd5391718 Author: cartsnitch-ceo[bot] <269712056+cartsnitch-ceo[bot]@users.noreply.github.com> Date: Sun Apr 19 02:40:14 2026 +0000 release: fix HIGH-severity CVEs in receiptwitness image (UAT+Security PASS) release: fix HIGH-severity CVEs in receiptwitness image (UAT+Security PASS) diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..289a751 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,12 @@ +__pycache__/ +*.pyc +.pytest_cache/ +*.egg-info/ +dist/ +.venv/ +.env +.git/ +.github/ +tests/ +*.md +renovate.json diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..687387e --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__/ +*.pyc +.pytest_cache/ +*.egg-info/ +dist/ +.venv/ +.env diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..255b742 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,227 @@ +# ReceiptWitness — CartSnitch Receipt Ingestion Service + +## Project Context + +CartSnitch is a self-hosted grocery price intelligence platform built as a polyrepo microservices architecture. This repo (`cartsnitch/receiptwitness`) is the receipt/purchase history ingestion service. 
+ +**GitHub org:** github.com/cartsnitch +**Domain:** cartsnitch.com + +### CartSnitch Services + +| Repo | Service | Purpose | +|------|---------|---------| +| `cartsnitch/common` | — | Shared models, schemas, utilities | +| `cartsnitch/receiptwitness` | ReceiptWitness | Purchase data ingestion via retailer scrapers (this repo) | +| `cartsnitch/api` | API Gateway | Frontend-facing REST API | +| `cartsnitch/cartsnitch` | Frontend | React PWA (mobile-first) | +| `cartsnitch/stickershock` | StickerShock | Price increase detection & CPI comparison | +| `cartsnitch/shrinkray` | ShrinkRay | Shrinkflation monitoring | +| `cartsnitch/clipartist` | ClipArtist | Coupon/deal watching & shopping optimization | +| `cartsnitch/infra` | — | K8s manifests, Flux kustomizations | + +### Architecture Decisions + +- **Polyrepo:** Each service has its own repo, Dockerfile, CI/CD pipeline. +- **Shared DB:** One PostgreSQL cluster. This service writes to `purchases`, `purchase_items`, `price_history` tables. Models come from `cartsnitch-common`. +- **Inter-service comms:** REST (synchronous) + Redis pub/sub (async events). +- **Target scale:** 500–1,000 users. Each user has their own authenticated sessions to up to 3 retailers. + +## What This Service Does + +ReceiptWitness authenticates with grocery retailer web portals using per-user sessions, scrapes purchase history / receipt data, parses it into structured records, and writes it to the shared database. After ingestion, it publishes a `cartsnitch.receipts.ingested` event so downstream services (StickerShock, ClipArtist) can react. + +### Target Retailers (MVP) + +#### Meijer (mPerks) +- **Auth:** No public API. Session cookie-based auth on mperks.meijer.com. +- **Receipt location:** meijer.com/mperks/receipts-savings.html (or underlying XHR endpoints) +- **Approach:** Playwright login → capture session → hit receipt XHR endpoints directly. Map the API calls the frontend makes via browser dev tools network tab. 
+- **Prior art:** `dapperfu/python_Meijer` (requires MITM proxy for auth — avoid this pattern, prefer direct browser automation). +- **Data available:** Digital receipts appear ~15 minutes after purchase if mPerks ID was used at checkout. Includes item names, prices, discounts, savings. + +#### Kroger +- **Auth:** No public API for purchase history (that's behind Partner API). Session cookie-based auth on kroger.com. +- **Receipt location:** kroger.com/mypurchases +- **Approach:** Playwright login → scrape purchase history pages or intercept XHR endpoints. +- **Anti-bot:** Kroger uses Akamai Bot Manager. Aggressive headless browser detection. Need Playwright stealth, realistic fingerprinting, human-like interaction pacing. +- **Prior art:** `phyllis-vance/KrogerScrape` (.NET, old), `callaginn/kroger-sweeper` (Puppeteer/Node), `ThermoMan/Get-Kroger-Grocery-List` (Greasemonkey userscript). +- **Kroger public API:** Free developer account at developer.kroger.com provides product catalog data (`product.compact` scope) — useful for enriching scraped receipt data with UPCs, categories, product images. NOT useful for purchase history. +- **Data available:** Purchase history tied to Kroger Plus loyalty card. Shows items, prices, quantities. + +#### Target (Circle) +- **Auth:** Session-based auth on target.com. +- **Receipt location:** target.com account → Orders → In-store tab, or target.com/account/orders +- **Approach:** Playwright login → scrape in-store purchase history. +- **Data available:** ~1 year of history if user paid with a linked card, used the Target app wallet, or entered their Target Circle phone number at checkout. Includes item names, prices. 
+ +## Tech Stack + +- Python 3.12+ +- Playwright (Python async API) for headless browser automation +- FastAPI (lightweight internal API for triggering scrapes, health checks, status) +- SQLAlchemy 2.0 (via `cartsnitch-common`) +- Redis (pub/sub event publishing) +- APScheduler or Celery (for scheduled scraping jobs) +- cryptography / Fernet (encrypting stored session data) + +## Repo Structure + +``` +receiptwitness/ +├── CLAUDE.md +├── README.md +├── pyproject.toml +├── Dockerfile # Playwright + Chromium headless +├── docker-compose.yml # Local dev (Postgres, Redis, this service) +├── src/ +│ └── receiptwitness/ +│ ├── __init__.py +│ ├── config.py # Service-specific settings +│ ├── main.py # FastAPI app + scheduler bootstrap +│ ├── scrapers/ +│ │ ├── __init__.py +│ │ ├── base.py # Abstract BaseScraper class +│ │ ├── meijer.py # Meijer/mPerks scraper +│ │ ├── kroger.py # Kroger scraper +│ │ └── target.py # Target/Circle scraper +│ ├── parsers/ +│ │ ├── __init__.py +│ │ ├── meijer.py # Parse raw Meijer receipt data → PurchaseItem records +│ │ ├── kroger.py +│ │ └── target.py +│ ├── session/ +│ │ ├── __init__.py +│ │ ├── manager.py # Session storage, retrieval, refresh logic +│ │ └── encryption.py # Encrypt/decrypt session cookies at rest +│ ├── scheduler.py # Scrape scheduling (per-user cron jobs) +│ ├── events.py # Publish receipt.ingested events to Redis +│ ├── api/ +│ │ ├── __init__.py +│ │ ├── routes.py # Internal API: trigger scrape, check status, health +│ │ └── auth.py # Internal service auth (API key or JWT) +│ └── enrichment.py # Optional: enrich receipt data via Kroger public API +└── tests/ + ├── conftest.py + ├── fixtures/ # Sample receipt HTML/JSON for testing parsers + │ ├── meijer_receipt.json + │ ├── kroger_receipt.html + │ └── target_receipt.html + ├── test_scrapers/ + ├── test_parsers/ + └── test_session/ +``` + +## Scraper Architecture + +### Base Scraper Pattern + +```python +class BaseScraper(ABC): + """All retailer scrapers implement this 
interface.""" + + @abstractmethod + async def login(self, credentials: UserStoreAccount) -> SessionData: ... + + @abstractmethod + async def check_session(self, session: SessionData) -> bool: ... + + @abstractmethod + async def scrape_receipts(self, session: SessionData, since: datetime | None) -> list[RawReceipt]: ... + + @abstractmethod + def parse_receipt(self, raw: RawReceipt) -> tuple[Purchase, list[PurchaseItem]]: ... +``` + +### Scraping Flow + +1. **Scheduler fires** for a user+store combination +2. **Load session** from `user_store_accounts` table (encrypted) +3. **Check session validity** — quick lightweight request to verify auth +4. **If expired:** launch Playwright, re-authenticate, save new session +5. **Scrape receipts** since `last_sync_at` timestamp +6. **Parse** raw data into `Purchase` and `PurchaseItem` records +7. **Deduplicate** — skip receipts already in DB (match on `receipt_id` per store) +8. **Write to DB** — insert new purchases and items +9. **Derive price_history** entries from purchase_items +10. **Publish event** — `cartsnitch.receipts.ingested` to Redis +11. **Update** `user_store_accounts.last_sync_at` + +### Session Management + +- Sessions (cookies, tokens) are encrypted at rest using Fernet symmetric encryption. +- The encryption key is provided via environment variable, not stored in the DB. +- Sessions are stored in the `user_store_accounts` table as encrypted JSONB. +- Each scrape attempt first checks if the existing session is valid before launching a full Playwright browser instance. +- When a session expires, the service needs the user's stored credentials OR a manual re-auth flow (the user logs in via the frontend, and we capture the session). + +### Anti-Bot Considerations + +- Use `playwright-stealth` or equivalent to mask automation signals. +- Set realistic viewport sizes, user agents, and locale settings. +- Add human-like delays between page navigations (randomized 1-5 seconds). 
+- For Kroger specifically (Akamai Bot Manager): may need to use non-headless mode on initial auth, or route through a persistent browser profile that has established trust. +- Rate limit scraping: no more than 1 scrape per user per store per hour. Default cadence: once daily. +- Store and reuse browser profiles/cookies to minimize fresh logins. + +### Dockerfile + +The Dockerfile must include Playwright and Chromium. Base image pattern: + +```dockerfile +FROM mcr.microsoft.com/playwright/python:v1.49.0-noble +# Install deps, copy code, etc. +``` + +This is a large image (~2GB) due to Chromium. Consider multi-stage builds if the final image can be slimmed down. + +## Internal API Endpoints + +This service exposes a lightweight internal API (not public-facing): + +- `GET /health` — health check +- `GET /status/{user_id}` — sync status per store for a user +- `POST /scrape/{user_id}/{store_slug}` — trigger an immediate scrape for a user+store +- `POST /scrape/{user_id}/all` — trigger scrape across all configured stores +- `GET /sessions/{user_id}` — list configured store sessions and their status + +The public-facing API gateway (`cartsnitch/api`) proxies user-facing requests to this service's internal API. + +## Events Published + +### `cartsnitch.receipts.ingested` + +Published after new receipt data is successfully written to the DB. + +```json +{ + "event_type": "cartsnitch.receipts.ingested", + "timestamp": "2026-03-15T12:00:00Z", + "service": "receiptwitness", + "payload": { + "user_id": "uuid", + "store_slug": "meijer", + "purchase_id": "uuid", + "purchase_date": "2026-03-14", + "item_count": 23, + "total": 87.42 + } +} +``` + +## Development Workflow + +- **Never push directly to main.** Always create feature branches and open PRs. +- Branch naming: `feature//` or `fix/` +- Use conventional commits: `feat:`, `fix:`, `refactor:`, `docs:`, `chore:` +- Test parsers with fixture data (sample receipts in `tests/fixtures/`). 
Scraper integration tests require real credentials and should be tagged/skipped in CI. +- Local dev: `docker-compose up` starts Postgres, Redis, and the service. Playwright runs inside the container. + +## Important Notes + +- The Playwright container image is large. On K8s, consider using a dedicated node or tolerating scheduling delays. +- Each user needs their own authenticated sessions. At 1,000 users × 3 stores = 3,000 sessions to manage. Sessions expire at different rates per retailer. +- Scraping must be respectful: randomized intervals, rate limiting, no parallel scraping of the same store for the same user. +- Receipt data structure varies significantly between retailers. The parsers must be robust and handle edge cases (returns, voided items, weighted produce, BOGO items, coupon stacking). +- Kroger's public API (`product.compact` scope) can be used to enrich scraped data with UPCs and product metadata after receipt parsing. This is optional but improves product normalization downstream. +- Store credentials for users should ideally NOT be stored by CartSnitch. Prefer a flow where the user authenticates in a controlled browser session, and we capture/store only the resulting session cookies. If credential storage is necessary, use strong encryption and make the tradeoffs clear to users. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..65418d2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,66 @@ +# Stage 1: Build dependencies +FROM python:3.12-slim AS build + +WORKDIR /app + +# build-essential and libpq-dev are needed to compile any C-extension wheels +# (e.g. psycopg2 fallback). No git needed — common/ is copied from the repo root. +ARG APT_CACHE_BUST=1 +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + libpq-dev \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Build context is the repo root. These paths are relative to the root. 
+COPY receiptwitness/pyproject.toml ./ +COPY receiptwitness/src/ ./src/ +COPY common/ ./common/ + +# Install from the local common/ (cartsnitch-common>=0.1.0 in pyproject.toml +# will be satisfied by the local package) then install receiptwitness itself. +RUN pip install --no-cache-dir --prefix=/install ./common/ . + +# Stage 2: Production image with Playwright + Chromium +FROM python:3.12-slim AS prod + +WORKDIR /app + +# Install Playwright system dependencies for Chromium +ARG APT_CACHE_BUST=1 +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + libnss3 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdrm2 \ + libxkbcommon0 \ + libxcomposite1 \ + libxdamage1 \ + libxrandr2 \ + libgbm1 \ + libpango-1.0-0 \ + libcairo2 \ + libasound2 \ + libxshmfence1 \ + libx11-xcb1 \ + libxcb-dri3-0 \ + fonts-liberation \ + && rm -rf /var/lib/apt/lists/* + +RUN adduser --system --group --uid 1000 app + +COPY --from=build /install /usr/local +COPY receiptwitness/src/ ./src/ + +# Install Playwright Chromium browser (runs as root; /opt/playwright is world-readable) +RUN PLAYWRIGHT_BROWSERS_PATH=/opt/playwright playwright install chromium + +ENV PLAYWRIGHT_BROWSERS_PATH=/opt/playwright + +USER 1000 +EXPOSE 8000 + +HEALTHCHECK --interval=30s --timeout=3s \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" + +CMD ["uvicorn", "receiptwitness.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a698913 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,59 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "receiptwitness" +version = "0.1.0" +description = "CartSnitch receipt/purchase history ingestion service" +requires-python = ">=3.12" +dependencies = [ + "cartsnitch-common>=0.1.0", + "playwright>=1.49,<2.0", + "playwright-stealth>=1.0,<2.0", + "cryptography>=46.0,<47.0", 
+ "fastapi>=0.115,<1.0", + "uvicorn[standard]>=0.30,<1.0", + "beautifulsoup4>=4.12,<5.0", + "redis>=5.0,<6.0", + "pydantic>=2.0,<3.0", + "pydantic-settings>=2.0,<3.0", + "sqlalchemy[asyncio]>=2.0,<3.0", + "asyncpg>=0.29,<1.0", + "resend>=2.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0", + "pytest-asyncio>=0.23", + "ruff>=0.3", + "pytest-cov>=5.0", + "fakeredis[aioredis]>=2.20", + "httpx>=0.27", + "python-multipart>=0.0.9", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/receiptwitness"] + +[tool.ruff] +target-version = "py312" +line-length = 100 + +[tool.ruff.lint] +select = ["E", "F", "I", "N", "W", "UP"] + +[tool.mypy] +python_version = "3.12" +strict = false +warn_return_any = true +warn_unused_ignores = true + +[[tool.mypy.overrides]] +module = "cartsnitch_common.*" +ignore_missing_imports = true + +[tool.pytest.ini_options] +asyncio_mode = "auto" +testpaths = ["tests"] diff --git a/renovate.json b/renovate.json new file mode 100644 index 0000000..833ba3b --- /dev/null +++ b/renovate.json @@ -0,0 +1,4 @@ +{ + "$schema": "https://docs.renovatebot.com/renovate-schema.json", + "extends": ["local>cartsnitch/.github:renovate-config"] +} diff --git a/src/receiptwitness/__init__.py b/src/receiptwitness/__init__.py new file mode 100644 index 0000000..6b17aab --- /dev/null +++ b/src/receiptwitness/__init__.py @@ -0,0 +1 @@ +"""ReceiptWitness — CartSnitch receipt ingestion service.""" diff --git a/src/receiptwitness/api/__init__.py b/src/receiptwitness/api/__init__.py new file mode 100644 index 0000000..74ded59 --- /dev/null +++ b/src/receiptwitness/api/__init__.py @@ -0,0 +1 @@ +"""Internal API for ReceiptWitness service.""" diff --git a/src/receiptwitness/api/routes.py b/src/receiptwitness/api/routes.py new file mode 100644 index 0000000..2437bcc --- /dev/null +++ b/src/receiptwitness/api/routes.py @@ -0,0 +1,66 @@ +"""Internal API routes for triggering scrapes and checking status.""" + +import hashlib +import hmac +import re +import time + 
def verify_mailgun_signature(token: str, timestamp: str, signature: str) -> bool:
    """Verify the HMAC-SHA256 signature attached to a Mailgun webhook POST.

    The expected signature is the hex digest of HMAC-SHA256 over the
    concatenation ``timestamp + token``, keyed with
    ``settings.mailgun_webhook_signing_key``.

    Args:
        token: The ``token`` form field sent with the webhook.
        timestamp: The ``timestamp`` form field (Unix seconds, as a string).
        signature: The ``signature`` form field (hex digest) to check.

    Returns:
        True only when the timestamp is an integer within 5 minutes of the
        current time, a signing key is configured, and the signature matches.
        False in every other case; this function does not raise for
        malformed input.
    """
    try:
        ts = int(timestamp)
    except (ValueError, TypeError):
        return False
    if abs(time.time() - ts) > 300:  # 5 min freshness window limits replay
        return False

    signing_key = settings.mailgun_webhook_signing_key
    if not signing_key:
        # Fail closed: an HMAC keyed with the empty string can be computed by
        # anyone, so an unset key must never validate a request.
        return False

    expected = hmac.new(
        signing_key.encode(), f"{timestamp}{token}".encode(), hashlib.sha256
    ).hexdigest()
    # Compare as bytes: compare_digest() raises TypeError for str arguments
    # containing non-ASCII characters, and the signature is caller-controlled.
    return hmac.compare_digest(signature.encode(), expected.encode())
Enqueue — worker resolves token -> user_id + body_html_val = form.get("body-html") + body_plain_val = form.get("body-plain") + job = EmailJob( + user_id=account_token, + sender=str(form.get("sender", "")), + recipient=recipient, + subject=str(form.get("subject", "")), + body_html=str(body_html_val) if body_html_val is not None else None, + body_plain=str(body_plain_val) if body_plain_val is not None else None, + received_at=str(form.get("timestamp", "")), + message_id=str(form.get("Message-Id", "")), + ) + client = await get_redis() + await enqueue_email(client, job) + return {"status": "queued"} + + +@router.get("/health") +async def health(): + return {"status": "ok", "service": "receiptwitness"} diff --git a/src/receiptwitness/config.py b/src/receiptwitness/config.py new file mode 100644 index 0000000..b9d2574 --- /dev/null +++ b/src/receiptwitness/config.py @@ -0,0 +1,67 @@ +"""Service-specific configuration for ReceiptWitness.""" + +from pydantic import model_validator +from pydantic_settings import BaseSettings + + +_PLACEHOLDER_VALUES = {"change-me-in-production"} + + +class ReceiptWitnessSettings(BaseSettings): + model_config = {"env_prefix": "RW_"} + + # Inherited from cartsnitch-common + database_url: str = "postgresql+asyncpg://cartsnitch:cartsnitch@localhost:5432/cartsnitch" + redis_url: str = "redis://localhost:6379/0" + + # Session encryption + session_encryption_key: str = "" + + # Scraping defaults + scrape_interval_seconds: int = 86400 # 24 hours + min_request_delay_ms: int = 1000 + max_request_delay_ms: int = 5000 + + # Playwright + headless: bool = True + browser_timeout_ms: int = 60000 + + # Email notifications (Resend) + resend_api_key: str = "" + notification_email_from: str = "notifications@cartsnitch.com" + notifications_enabled: bool = False + + # Mailgun inbound email webhook + mailgun_webhook_signing_key: str = "" + + @model_validator(mode="after") + def validate_required_vars(self): + errors = [] + if not self.session_encryption_key or 
self.session_encryption_key in _PLACEHOLDER_VALUES: + errors.append( + "RW_SESSION_ENCRYPTION_KEY must be set to a secure value. " + 'Generate one with: python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())"' + ) + if self.notifications_enabled and not self.resend_api_key: + errors.append( + "RW_RESEND_API_KEY must be set when RW_NOTIFICATIONS_ENABLED=true. " + "Get an API key from https://resend.com/api-keys" + ) + if errors: + raise ValueError( + "ReceiptWitness startup failed — missing required config:\n" + + "\n".join(f" - {e}" for e in errors) + ) + return self + + +class _LazySettings: + _instance: ReceiptWitnessSettings | None = None + + def __getattr__(self, name: str): + if _LazySettings._instance is None: + _LazySettings._instance = ReceiptWitnessSettings() + return getattr(_LazySettings._instance, name) + + +settings = _LazySettings() diff --git a/src/receiptwitness/events.py b/src/receiptwitness/events.py new file mode 100644 index 0000000..a9e6204 --- /dev/null +++ b/src/receiptwitness/events.py @@ -0,0 +1,113 @@ +"""Publish receipt ingestion events to Redis/DragonflyDB pub/sub.""" + +import json +import logging +import uuid +from datetime import UTC, datetime +from decimal import Decimal + +import redis.asyncio as aioredis +from cartsnitch_common.database import get_async_session_factory +from cartsnitch_common.models.user import User +from sqlalchemy import select + +from receiptwitness.config import settings +from receiptwitness.notifications.email import send_receipt_notification + +logger = logging.getLogger(__name__) + +CHANNEL_RECEIPTS_INGESTED = "cartsnitch.receipts.ingested" + +# Module-level connection pool — shared across all publish calls +_pool: aioredis.ConnectionPool | None = None + + +class _DecimalEncoder(json.JSONEncoder): + def default(self, o): + if isinstance(o, Decimal): + return float(o) + return super().default(o) + + +def _get_pool() -> aioredis.ConnectionPool: + """Get or create the shared 
Redis connection pool.""" + global _pool + if _pool is None: + _pool = aioredis.ConnectionPool.from_url( + settings.redis_url, decode_responses=True, max_connections=10 + ) + return _pool + + +async def get_redis_client() -> aioredis.Redis: + """Create an async Redis/DragonflyDB client with connection pooling.""" + return aioredis.Redis(connection_pool=_get_pool()) + + +async def _send_notification_for_event(payload: dict) -> None: + """Look up user email and send receipt notification. Silently skips on error.""" + try: + user_uuid = uuid.UUID(payload["user_id"]) + except (ValueError, KeyError): + logger.warning("Invalid user_id in event payload: %s", payload.get("user_id")) + return + + try: + session_factory = get_async_session_factory(settings.database_url) + async with session_factory() as session: + result = await session.execute(select(User.email).where(User.id == user_uuid)) + row = result.scalar_one_or_none() + if not row: + logger.warning("User %s not found for notification", user_uuid) + return + user_email = row + except Exception: + logger.exception("Failed to look up user email for notification") + return + + await send_receipt_notification( + user_email=user_email, + store_name=payload["store_slug"], + item_count=payload["item_count"], + total=payload["total"], + purchase_date=payload["purchase_date"], + ) + + +async def publish_receipt_ingested( + user_id: str, + store_slug: str, + purchase_id: str, + purchase_date: str, + item_count: int, + total: Decimal | float, +) -> None: + """Publish a cartsnitch.receipts.ingested event after successful ingestion.""" + payload = { + "user_id": user_id, + "store_slug": store_slug, + "purchase_id": purchase_id, + "purchase_date": purchase_date, + "item_count": item_count, + "total": float(total) if isinstance(total, Decimal) else total, + } + event = { + "event_type": CHANNEL_RECEIPTS_INGESTED, + "timestamp": datetime.now(UTC).isoformat(), + "service": "receiptwitness", + "payload": payload, + } + + try: + client 
= await get_redis_client() + await client.publish(CHANNEL_RECEIPTS_INGESTED, json.dumps(event, cls=_DecimalEncoder)) + logger.info( + "Published %s event for purchase %s", + CHANNEL_RECEIPTS_INGESTED, + purchase_id, + ) + except aioredis.ConnectionError: + logger.error("Failed to publish event — Redis/DragonflyDB connection error") + raise + else: + await _send_notification_for_event(payload) diff --git a/src/receiptwitness/main.py b/src/receiptwitness/main.py new file mode 100644 index 0000000..55cda42 --- /dev/null +++ b/src/receiptwitness/main.py @@ -0,0 +1,8 @@ +"""FastAPI app entrypoint for ReceiptWitness.""" + +from fastapi import FastAPI + +from receiptwitness.api.routes import router + +app = FastAPI(title="ReceiptWitness", version="0.1.0") +app.include_router(router) diff --git a/src/receiptwitness/notifications/__init__.py b/src/receiptwitness/notifications/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/receiptwitness/notifications/email.py b/src/receiptwitness/notifications/email.py new file mode 100644 index 0000000..d5723f2 --- /dev/null +++ b/src/receiptwitness/notifications/email.py @@ -0,0 +1,45 @@ +"""Email notifications via Resend.""" + +import asyncio +import html +import logging + +import resend + +from receiptwitness.config import settings + +logger = logging.getLogger(__name__) + + +async def send_receipt_notification( + user_email: str, + store_name: str, + item_count: int, + total: float, + purchase_date: str, +) -> None: + """Send receipt ingestion confirmation email via Resend.""" + if not settings.notifications_enabled or not settings.resend_api_key: + logger.debug("Notifications disabled — skipping email send") + return + + resend.api_key = settings.resend_api_key + store_name_safe = html.escape(store_name) + purchase_date_safe = html.escape(purchase_date) + try: + await asyncio.to_thread( + resend.Emails.send, + { + "from": settings.notification_email_from, + "to": [user_email], + "subject": f"Receipt 
processed: {store_name} - ${total:.2f}", + "html": ( + f"

Your receipt from {store_name_safe} on " + f"{purchase_date_safe} has been processed.

" + f"

{item_count} items, total: ${total:.2f}

" + ), + }, + ) + logger.info("Receipt notification sent to %s", user_email) + except Exception: + logger.exception("Failed to send receipt notification to %s", user_email) diff --git a/src/receiptwitness/parsers/__init__.py b/src/receiptwitness/parsers/__init__.py new file mode 100644 index 0000000..2b56ce8 --- /dev/null +++ b/src/receiptwitness/parsers/__init__.py @@ -0,0 +1 @@ +"""Receipt parsers for each retailer.""" diff --git a/src/receiptwitness/parsers/email/__init__.py b/src/receiptwitness/parsers/email/__init__.py new file mode 100644 index 0000000..9d01da5 --- /dev/null +++ b/src/receiptwitness/parsers/email/__init__.py @@ -0,0 +1 @@ +"""Email receipt parsers for retailer email receipts.""" diff --git a/src/receiptwitness/parsers/email/base.py b/src/receiptwitness/parsers/email/base.py new file mode 100644 index 0000000..a25535e --- /dev/null +++ b/src/receiptwitness/parsers/email/base.py @@ -0,0 +1,32 @@ +"""Base interface for email receipt parsers.""" + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field + + +@dataclass +class EmailReceipt: + """Raw email data before parsing.""" + + sender: str + recipient: str + subject: str + body_html: str | None = None + body_plain: str | None = None + received_at: str | None = None + raw_headers: dict = field(default_factory=dict) + + +class BaseEmailParser(ABC): + """All retailer email parsers implement this interface.""" + + @abstractmethod + def can_parse(self, email: EmailReceipt) -> bool: + """Return True if this parser handles this email.""" + ... + + @abstractmethod + def parse(self, email: EmailReceipt) -> dict: + """Parse email into a dict matching PurchaseCreate schema fields. + Must include an items list matching PurchaseItemCreate fields.""" + ... 
def _extract_total(body: str) -> Decimal:
    """Extract the transaction total from an email body.

    Labels are tried from most to least specific: "Grand Total", then a
    standalone "Total", then "Amount". Matching is case-insensitive and the
    first labelled amount found for the winning label is returned.

    The standalone "Total" label deliberately does not match "Subtotal",
    "Sub Total" or "Sub-Total"; otherwise a receipt that lists the subtotal
    before the total would report the subtotal.

    Args:
        body: Email body text to search.

    Returns:
        The amount following the matched label (thousands separators
        removed), or ``Decimal("0")`` when no labelled amount is present.
    """
    # Optional ":"/whitespace, optional "$", then an amount with two decimals.
    amount = r"[:\s]*\$?([0-9,]+\.[0-9]{2})"
    patterns = (
        r"Grand\s+Total" + amount,
        # \b rejects "Subtotal"; the lookbehind rejects "Sub Total"/"Sub-Total".
        r"(?<!sub[\s-])\bTotal" + amount,
        r"Amount" + amount,
    )
    for pattern in patterns:
        match = re.search(pattern, body, re.IGNORECASE)
        if match:
            # The capture group only admits ASCII digits, commas and one
            # decimal point, so this conversion cannot fail.
            return Decimal(match.group(1).replace(",", ""))
    return Decimal("0")
class KrogerEmailParser(BaseEmailParser):
    """Parser for Kroger digital receipt emails.

    ``can_parse`` is a plain substring test of each keyword against the
    lower-cased sender and body; ``parse`` runs the module's regex/soup
    extractors over the HTML body (or the plain body when no HTML exists).
    """

    # NOTE(review): matching is by substring, so "plus" also hits any sender
    # or body that merely contains those letters — confirm this is intended.
    KROGER_KEYWORDS = ("kroger", "kroger.com", "plus")

    def can_parse(self, email: EmailReceipt) -> bool:
        """Return True if any keyword occurs in the sender or the body."""
        sender_text = (email.sender or "").lower()
        body_text = (email.body_html or email.body_plain or "").lower()
        for keyword in self.KROGER_KEYWORDS:
            if keyword in sender_text or keyword in body_text:
                return True
        return False

    def parse(self, email: EmailReceipt) -> dict:
        """Return receipt id, purchase date, total and line items for *email*.

        Missing values come back as ``""`` (id, date), ``Decimal("0")``
        (total) or an empty list (items); nothing is raised for absent fields.
        """
        content = (email.body_html or email.body_plain or "").strip()
        return {
            "receipt_id": _extract_receipt_id(content) or "",
            "purchase_date": _extract_date(content),
            "total": _extract_total(content),
            "items": _extract_items_soup(content),
        }
# Month names (and common abbreviations) mapped to two-digit month numbers.
_MONTH_NUMBERS = {
    "january": "01",
    "jan": "01",
    "february": "02",
    "feb": "02",
    "march": "03",
    "mar": "03",
    "april": "04",
    "apr": "04",
    "may": "05",
    "june": "06",
    "jun": "06",
    "july": "07",
    "jul": "07",
    "august": "08",
    "aug": "08",
    "september": "09",
    "sept": "09",
    "sep": "09",
    "october": "10",
    "oct": "10",
    "november": "11",
    "nov": "11",
    "december": "12",
    "dec": "12",
}


def _date_from_text(text: str) -> str | None:
    """Return the first recognisable date in *text* as ``YYYY-MM-DD``, or None.

    Formats are tried in this order: ISO ``YYYY-MM-DD``, written
    ``March 15, 2026`` / ``Mar. 15 2026``, then US ``MM/DD/YYYY``.
    """
    iso = re.search(r"(\d{4})-(\d{2})-(\d{2})", text)
    if iso:
        return "-".join(iso.groups())

    # Scan every "<word> <day>, <year>" triple: the first one is often not a
    # date at all (e.g. "Lane 4 1234"), so do not give up after one candidate.
    for written in re.finditer(r"([A-Za-z]+)\.?\s+(\d{1,2}),?\s+(\d{4})", text):
        month = _MONTH_NUMBERS.get(written.group(1).lower())
        if month:
            return f"{written.group(3)}-{month}-{written.group(2).zfill(2)}"

    us = re.search(r"(\d{1,2})/(\d{1,2})/(\d{4})", text)
    if us:
        month, day, year = us.groups()
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

    return None


def _extract_purchase_date(soup: BeautifulSoup, subject: str | None) -> str | None:
    """Extract the purchase date from the email body, falling back to the subject.

    Args:
        soup: Parsed email body; only ``get_text()`` is used.
        subject: Email subject line, consulted only when the body has no date.

    Returns:
        The date as ``YYYY-MM-DD``, or None when neither source has one.
        Values are reformatted, not validated as calendar dates.
    """
    for text in (soup.get_text(), subject or ""):
        found = _date_from_text(text)
        if found:
            return found
    return None
def _extract_items(table: Tag | None) -> list[dict]:
    """Turn the rows of the receipt's items table into line-item dicts.

    Each ``<tr>`` with at least three ``<td>`` cells is read as
    (name, quantity, price). Header rows, rows with an empty name, and rows
    whose name contains a summary label (subtotal, tax, total, savings) are
    dropped. An unparseable quantity becomes 1 and an unparseable price 0.

    ``extended_price`` is set to the parsed price as-is; it is not multiplied
    by the quantity.
    """
    if not table:
        return []

    header_names = ("item", "description")
    summary_labels = ("subtotal", "tax", "total", "savings", "grand total")
    bad_number = (InvalidOperation, ValueError, TypeError)

    parsed: list[dict] = []
    for tr in table.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) < 3:
            continue

        name, qty_text, price_text = (td.get_text(strip=True) for td in tds[:3])
        lowered = name.lower()

        if not name or lowered in header_names:
            continue
        # Substring match: any name containing a summary label is skipped.
        if any(label in lowered for label in summary_labels):
            continue

        try:
            qty = Decimal(qty_text)
        except bad_number:
            qty = Decimal("1")

        try:
            price = Decimal(price_text.replace("$", "").replace(",", "").strip())
        except bad_number:
            price = Decimal("0")

        parsed.append(
            {
                "product_name_raw": name,
                "quantity": qty,
                "unit_price": price,
                "extended_price": price,
            }
        )

    return parsed
def _extract_totals(soup: BeautifulSoup) -> dict:
    """Extract subtotal, tax, total, and savings from a parsed HTML receipt.

    The document is flattened to text and handed to ``_extract_totals_plain``,
    so HTML and plain-text receipts share one set of label patterns.

    Returns:
        A dict with keys ``subtotal``, ``tax``, ``total`` and
        ``savings_total``; each value is a Decimal or None when its label
        was not found.
    """
    # get_text() joins adjacent text nodes with no separator by default, so
    # markup without whitespace between cells (<td>4.99</td><td>Tax</td>)
    # flattens to "4.99Tax" and the \b-anchored label patterns never match.
    # A space separator restores the word boundary.
    return _extract_totals_plain(soup.get_text(separator=" "))
Extract email from "Name " format + match = re.search(r"<([^>]+)>", sender) + if match: + sender = match.group(1) + return "meijer" in sender + + def parse(self, email: EmailReceipt) -> dict: + body_html = email.body_html + body_plain = email.body_plain or "" + body = body_html or body_plain + soup = BeautifulSoup(body, "html.parser") + + receipt_id = _extract_receipt_id(soup, email.subject) + purchase_date = _extract_purchase_date(soup, email.subject) + _ = _extract_store_info(soup) + + # Find the items table — look for one with Item/Qty/Price headers + table = None + for tbl in soup.find_all("table"): + headers = tbl.find_all("th") + header_texts = [h.get_text(strip=True).lower() for h in headers] + if any("item" in h or "qty" in h or "price" in h for h in header_texts): + table = tbl + break + + items = _extract_items(table) + + # Extract totals from HTML; fall back to plain text if no HTML + if body_html: + totals = _extract_totals(soup) + else: + totals = _extract_totals_plain(body_plain) + + return { + "receipt_id": receipt_id or "", + "purchase_date": purchase_date or "", + "total": totals["total"] or Decimal("0"), + "subtotal": totals["subtotal"], + "tax": totals["tax"], + "savings_total": totals["savings_total"], + "items": items, + } diff --git a/src/receiptwitness/parsers/email/target.py b/src/receiptwitness/parsers/email/target.py new file mode 100644 index 0000000..c7e58d3 --- /dev/null +++ b/src/receiptwitness/parsers/email/target.py @@ -0,0 +1,156 @@ +"""Target email receipt parser.""" + +import logging +import re +from datetime import datetime +from decimal import Decimal, InvalidOperation + +from bs4 import BeautifulSoup + +from receiptwitness.parsers.email.base import BaseEmailParser, EmailReceipt + +logger = logging.getLogger(__name__) + + +def _to_decimal(value: str | float | int | None, default: Decimal = Decimal("0")) -> Decimal: + """Safely convert a value to Decimal.""" + if value is None: + return default + try: + return 
def _extract_total(body: str) -> Decimal:
    """Extract the transaction total from an email body.

    HTML tags are replaced by spaces first, so a label and its amount in
    neighbouring elements still line up. Labels are tried from most to least
    specific: ``Grand Total``, a standalone ``Total`` (never the tail of
    ``Subtotal`` / ``Sub Total`` / ``Sub-total``), then ``Amount``.

    Returns:
        The first matching amount, or ``Decimal("0")`` when none is found.
    """
    text = re.sub(r"<[^>]+>", " ", body)
    amount = r"[:\s]*\$?([0-9,]+\.[0-9]{2})"
    patterns = [
        r"\bGrand\s+Total" + amount,
        # Unanchored "Total" also matches inside "Subtotal", which would
        # return the subtotal as the receipt total.
        r"(?<![A-Za-z])(?<!Sub[\s-])Total" + amount,
        r"\bAmount" + amount,
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            # The capture group only admits digits, commas and one decimal
            # point, so dropping the commas always leaves a valid literal.
            return Decimal(match.group(1).replace(",", ""))
    return Decimal("0")
def _extract_items_soup(body: str, max_items: int | None = 20) -> list[dict]:
    """Extract line items from an HTML email body using BeautifulSoup.

    The body is flattened to one text line per element; every line of the
    form ``<name> $<price>`` that does not start with a known non-item label
    becomes an item with quantity 1. Names of two characters or fewer and
    non-positive prices are ignored.

    Args:
        body: Raw HTML (or plain text) of the email.
        max_items: Upper bound on returned items; ``None`` disables the cap.
            The default of 20 keeps the previous behaviour.

    Returns:
        A list of item dicts; empty when the body cannot be parsed.
    """
    skip_prefixes = (
        "Subtotal",
        "Tax",
        "Total",
        "Target",
        "Kroger",
        "Date",
        "Receipt",
        "Order",
        "Transaction",
        "Confirmation",
        "Thank",
        "Questions",
        "Keep",
        "Store",
    )

    try:
        text = BeautifulSoup(body, "html.parser").get_text(separator="\n", strip=True)
    except Exception:
        # Extraction stays best-effort, but the failure is now logged rather
        # than swallowed.
        logger.exception("Could not parse receipt body; returning no items")
        return []

    items: list[dict] = []
    for line in text.split("\n"):
        line = line.strip()
        if not line or line.startswith(skip_prefixes):
            continue
        # Match lines like "Product Name $9.99"; commas are allowed so that
        # "$1,299.99" is not rejected.
        match = re.match(r"(.+?)\s+\$([0-9][0-9,]*\.[0-9]{2})\s*$", line)
        if not match:
            continue
        name = match.group(1).strip()
        price = _to_decimal(match.group(2))
        if len(name) > 2 and price > 0:
            items.append(
                {
                    "product_name_raw": name,
                    "quantity": Decimal("1"),
                    "unit_price": price,
                    "extended_price": price,
                }
            )

    return items if max_items is None else items[:max_items]
(email.body_html or email.body_plain or "").strip() + total = _extract_total(body) + receipt_id = _extract_receipt_id(body) or "" + purchase_date = _extract_date(body) + items = _extract_items_soup(body) + + return { + "receipt_id": receipt_id, + "purchase_date": purchase_date, + "total": total, + "items": items, + } diff --git a/src/receiptwitness/parsers/kroger.py b/src/receiptwitness/parsers/kroger.py new file mode 100644 index 0000000..13e5a20 --- /dev/null +++ b/src/receiptwitness/parsers/kroger.py @@ -0,0 +1,148 @@ +"""Kroger receipt parser. + +Transforms raw Kroger receipt JSON into the common PurchaseCreate schema. +Kroger receipt data uses different field names than Meijer — this parser +handles Kroger-specific naming conventions and receipt structure. +""" + +import logging +from decimal import Decimal, InvalidOperation + +from receiptwitness.scrapers.base import RawReceipt + +logger = logging.getLogger(__name__) + + +def _to_decimal(value, default: str = "0") -> Decimal: + """Safely convert a value to Decimal.""" + if value is None: + return Decimal(default) + try: + return Decimal(str(value)) + except (InvalidOperation, ValueError, TypeError): + return Decimal(default) + + +def _parse_item(item: dict) -> dict: + """Parse a single line item from a Kroger receipt. 
+ + Kroger items typically include fields like: + - description / itemDescription / productName + - upc / krogerProductId + - quantity / qty + - basePrice / unitPrice / price + - totalPrice / extendedAmount / lineTotal + - regularPrice / originalPrice + - salePrice / promoPrice + - couponAmount / couponSavings + - loyaltyDiscount / fuelPointsDiscount / plusCardSavings + - department / category / aisle + """ + description = ( + item.get("description") + or item.get("itemDescription") + or item.get("productName") + or item.get("name") + or "UNKNOWN ITEM" + ) + + quantity = _to_decimal(item.get("quantity", item.get("qty", item.get("quantitySold", 1))), "1") + unit_price = _to_decimal(item.get("basePrice", item.get("unitPrice", item.get("price", 0)))) + extended_price = _to_decimal( + item.get("totalPrice", item.get("extendedAmount", item.get("lineTotal"))) + ) + + # Compute extended_price if not provided + if extended_price == Decimal("0") and unit_price != Decimal("0"): + extended_price = unit_price * quantity + + regular_price = item.get("regularPrice", item.get("originalPrice")) + sale_price = item.get("salePrice", item.get("promoPrice")) + coupon_discount = item.get( + "couponAmount", item.get("couponSavings", item.get("couponDiscount")) + ) + loyalty_discount = item.get( + "plusCardSavings", + item.get("loyaltyDiscount", item.get("fuelPointsDiscount")), + ) + + # UPC handling — Kroger may use krogerProductId or upc + upc = item.get("upc", item.get("UPC", item.get("krogerProductId"))) + if upc: + upc = str(upc).strip().lstrip("0") or None + + category = item.get("department", item.get("category", item.get("aisle"))) + + # Weight info for produce/deli items + weight = item.get("weight", item.get("netWeight")) + extra = {} + if weight is not None: + extra["weight"] = str(weight) + weight_uom = item.get("weightUom", item.get("unitOfMeasure")) + if weight_uom: + extra["weight_uom"] = weight_uom + + result = { + "product_name_raw": description.strip(), + "upc": upc, + 
"quantity": quantity, + "unit_price": unit_price, + "extended_price": extended_price, + "regular_price": (_to_decimal(regular_price) if regular_price is not None else None), + "sale_price": (_to_decimal(sale_price) if sale_price is not None else None), + "coupon_discount": (_to_decimal(coupon_discount) if coupon_discount is not None else None), + "loyalty_discount": ( + _to_decimal(loyalty_discount) if loyalty_discount is not None else None + ), + "category_raw": category.strip() if category else None, + } + + return result + + +def parse_kroger_receipt(raw: RawReceipt) -> dict: + """Parse a RawReceipt from Kroger into a PurchaseCreate-compatible dict.""" + data = raw.raw_data + detail = data.get("detail", {}) + + # Parse items — Kroger uses "items" or "lineItems" or "receiptItems" + raw_items = detail.get("items", detail.get("lineItems", detail.get("receiptItems", []))) + items = [] + for raw_item in raw_items: + # Skip voided / returned items + if raw_item.get("voided") or raw_item.get("status") in ( + "VOIDED", + "RETURNED", + ): + logger.debug("Skipping voided/returned item: %s", raw_item.get("description")) + continue + if raw_item.get("returnFlag") or raw_item.get("isReturn"): + logger.debug("Skipping returned item: %s", raw_item.get("description")) + continue + items.append(_parse_item(raw_item)) + + # Parse totals — Kroger uses various field names + total = _to_decimal( + detail.get( + "total", + data.get("total", data.get("orderTotal", data.get("grandTotal", 0))), + ) + ) + subtotal = detail.get("subtotal", data.get("subtotal", data.get("subTotal"))) + tax = detail.get("tax", data.get("tax", data.get("salesTax"))) + savings = detail.get( + "totalSavings", + data.get("savings", data.get("totalDiscount", data.get("youSaved"))), + ) + + return { + "receipt_id": raw.receipt_id, + "purchase_date": raw.purchase_date, + "total": total, + "subtotal": _to_decimal(subtotal) if subtotal is not None else None, + "tax": _to_decimal(tax) if tax is not None else None, + 
def _to_decimal(value, default: str = "0") -> Decimal:
    """Safely convert a value to Decimal.

    Args:
        value: Anything with a meaningful ``str()`` — int, float, str, Decimal.
        default: Decimal literal returned when *value* is unusable.

    Returns:
        ``Decimal(str(value))``, or ``Decimal(default)`` when *value* is
        None, cannot be parsed, or is not a finite number.
    """
    if value is None:
        return Decimal(default)
    try:
        number = Decimal(str(value))
    except (InvalidOperation, ValueError, TypeError):
        return Decimal(default)
    # "NaN" and "Infinity" are valid Decimal literals but not valid amounts.
    return number if number.is_finite() else Decimal(default)
def _first_not_none(*values):
    """Return the first argument that is not None, or None if all are."""
    return next((value for value in values if value is not None), None)


def parse_meijer_receipt(raw: RawReceipt) -> dict:
    """Parse a RawReceipt from Meijer into a PurchaseCreate-compatible dict.

    Line items are read from ``raw_data["detail"]`` under ``items`` (or
    ``lineItems``); voided items are skipped. Totals are read from the detail
    payload first, then from the top-level transaction summary. A missing or
    null ``detail`` / ``items`` is treated as empty rather than raising, and
    a null total falls through to the next candidate key.

    Returns a dict with keys matching PurchaseCreate schema fields.
    The caller is responsible for setting store_id and store_location_id
    from the store registry.
    """
    data = raw.raw_data
    # "detail": null and a missing "detail" key are handled the same way.
    detail = data.get("detail") or {}

    # Parse items from the detail response
    raw_items = detail.get("items") or detail.get("lineItems") or []
    items = []
    for raw_item in raw_items:
        # Skip voided items
        if raw_item.get("voided") or raw_item.get("status") == "VOIDED":
            logger.debug("Skipping voided item: %s", raw_item.get("description"))
            continue
        items.append(_parse_item(raw_item))

    # Parse totals
    total = _to_decimal(
        _first_not_none(detail.get("total"), data.get("total"), data.get("transactionTotal"))
    )
    subtotal = _first_not_none(detail.get("subtotal"), data.get("subtotal"))
    tax = _first_not_none(detail.get("tax"), data.get("tax"))
    savings = _first_not_none(
        detail.get("totalSavings"), data.get("savings"), data.get("totalDiscount")
    )

    return {
        "receipt_id": raw.receipt_id,
        "purchase_date": raw.purchase_date,
        "total": total,
        "subtotal": _to_decimal(subtotal) if subtotal is not None else None,
        "tax": _to_decimal(tax) if tax is not None else None,
        "savings_total": _to_decimal(savings) if savings is not None else None,
        "source_url": raw.source_url,
        "raw_data": data,
        "items": items,
    }
+ +Target receipt detail structure (reverse-engineered from target.com SPA): + +{ + "orderId": "TGT-2026-0315-7890", + "items": [ + { + "description": "GOOD & GATHER WHOLE MILK GAL", + "tcin": "14767459", + "upc": "0085239100123", + "quantity": 1, + "unitPrice": 3.89, + "totalPrice": 3.89, + "regularPrice": 4.19, + "circlePrice": 3.89, + "couponDiscount": 0.0, + "circleRewardsDiscount": 0.30, + "promoDescription": "Circle offer: Save 30c", + "department": "GROCERY" + } + ], + "subtotal": 78.32, + "tax": 4.89, + "total": 83.21, + "totalSavings": 11.45 +} +""" + +import logging +from decimal import Decimal, InvalidOperation + +from receiptwitness.scrapers.base import RawReceipt + +logger = logging.getLogger(__name__) + + +def _to_decimal(value, default: str = "0") -> Decimal: + """Safely convert a value to Decimal.""" + if value is None: + return Decimal(default) + try: + return Decimal(str(value)) + except (InvalidOperation, ValueError, TypeError): + return Decimal(default) + + +def _parse_item(item: dict) -> dict: + """Parse a single line item from a Target receipt. + + Target items may include fields like: + - description / itemDescription / productName + - tcin (Target internal product ID) / upc / dpci + - quantity / qty + - unitPrice / price + - totalPrice / extendedPrice / lineTotal + - regularPrice / originalPrice + - circlePrice / salePrice / promoPrice + - couponDiscount / couponSavings + - circleRewardsDiscount / circleDiscount / loyaltyDiscount + - promoDescription / offerDescription (e.g. 
"BOGO 50% off", "Circle offer") + - department / category + """ + description = ( + item.get("description") + or item.get("itemDescription") + or item.get("productName") + or item.get("name") + or "UNKNOWN ITEM" + ) + + quantity = _to_decimal(item.get("quantity", item.get("qty", item.get("quantitySold", 1))), "1") + unit_price = _to_decimal(item.get("unitPrice", item.get("price", item.get("basePrice", 0)))) + extended_price = _to_decimal( + item.get("totalPrice", item.get("extendedPrice", item.get("lineTotal"))) + ) + + # Compute extended_price if not provided + if extended_price == Decimal("0") and unit_price != Decimal("0"): + extended_price = unit_price * quantity + + regular_price = item.get("regularPrice", item.get("originalPrice")) + # Target Circle pricing — circlePrice takes precedence over generic salePrice + sale_price = item.get("circlePrice", item.get("salePrice", item.get("promoPrice"))) + coupon_discount = item.get( + "couponDiscount", item.get("couponSavings", item.get("couponAmount")) + ) + # Circle rewards / loyalty discount + loyalty_discount = item.get( + "circleRewardsDiscount", + item.get("circleDiscount", item.get("loyaltyDiscount")), + ) + + # UPC handling — Target may use tcin, upc, or dpci + upc = item.get("upc", item.get("UPC")) + if upc: + upc = str(upc).strip().lstrip("0") or None + + # Target also has TCIN (Target.com Item Number) and DPCI (Department/Class/Item) + tcin = item.get("tcin", item.get("TCIN")) + dpci = item.get("dpci", item.get("DPCI")) + + category = item.get("department", item.get("category")) + + # Capture promo/deal description for BOGO and Circle offers + promo_description = item.get("promoDescription", item.get("offerDescription")) + + # Weight info for produce/deli items + weight = item.get("weight", item.get("netWeight")) + extra: dict = {} + if weight is not None: + extra["weight"] = str(weight) + weight_uom = item.get("weightUom", item.get("unitOfMeasure")) + if weight_uom: + extra["weight_uom"] = weight_uom + if 
tcin: + extra["tcin"] = str(tcin) + if dpci: + extra["dpci"] = str(dpci) + if promo_description: + extra["promo_description"] = promo_description + + result: dict = { + "product_name_raw": description.strip(), + "upc": upc, + "quantity": quantity, + "unit_price": unit_price, + "extended_price": extended_price, + "regular_price": _to_decimal(regular_price) if regular_price is not None else None, + "sale_price": _to_decimal(sale_price) if sale_price is not None else None, + "coupon_discount": (_to_decimal(coupon_discount) if coupon_discount is not None else None), + "loyalty_discount": ( + _to_decimal(loyalty_discount) if loyalty_discount is not None else None + ), + "category_raw": category.strip() if category else None, + } + + return result + + +def parse_target_receipt(raw: RawReceipt) -> dict: + """Parse a RawReceipt from Target into a PurchaseCreate-compatible dict.""" + data = raw.raw_data + detail = data.get("detail", {}) + + # Parse items — Target uses "items" or "lineItems" + raw_items = detail.get("items", detail.get("lineItems", [])) + items = [] + for raw_item in raw_items: + # Skip voided / returned items + if raw_item.get("voided") or raw_item.get("status") in ( + "VOIDED", + "RETURNED", + "CANCELLED", + ): + logger.debug("Skipping voided/returned item: %s", raw_item.get("description")) + continue + if raw_item.get("returnFlag") or raw_item.get("isReturn"): + logger.debug("Skipping returned item: %s", raw_item.get("description")) + continue + items.append(_parse_item(raw_item)) + + # Parse totals + total = _to_decimal( + detail.get( + "total", + data.get("total", data.get("orderTotal", data.get("grandTotal", 0))), + ) + ) + subtotal = detail.get("subtotal", data.get("subtotal", data.get("subTotal"))) + tax = detail.get("tax", data.get("tax", data.get("salesTax"))) + savings = detail.get( + "totalSavings", + data.get("savings", data.get("totalDiscount", data.get("circleSavings"))), + ) + + return { + "receipt_id": raw.receipt_id, + "purchase_date": 
raw.purchase_date, + "total": total, + "subtotal": _to_decimal(subtotal) if subtotal is not None else None, + "tax": _to_decimal(tax) if tax is not None else None, + "savings_total": _to_decimal(savings) if savings is not None else None, + "source_url": raw.source_url, + "raw_data": data, + "items": items, + } diff --git a/src/receiptwitness/pipeline/__init__.py b/src/receiptwitness/pipeline/__init__.py new file mode 100644 index 0000000..e590387 --- /dev/null +++ b/src/receiptwitness/pipeline/__init__.py @@ -0,0 +1,30 @@ +"""Receipt & product matching pipeline — receipt normalization and product dedup.""" + +from receiptwitness.pipeline.matching import ( + ConfidenceLevel, + ProductMatcher, + match_purchase_item, +) +from receiptwitness.pipeline.normalization import ( + MatchMethod, + MatchResult, + clean_name, + extract_size_info, + jaccard_similarity, + normalize_product, +) +from receiptwitness.pipeline.receipt import normalize_receipt, parse_meijer_item + +__all__ = [ + "ConfidenceLevel", + "MatchMethod", + "MatchResult", + "ProductMatcher", + "clean_name", + "extract_size_info", + "jaccard_similarity", + "match_purchase_item", + "normalize_product", + "normalize_receipt", + "parse_meijer_item", +] diff --git a/src/receiptwitness/pipeline/matching.py b/src/receiptwitness/pipeline/matching.py new file mode 100644 index 0000000..7e71039 --- /dev/null +++ b/src/receiptwitness/pipeline/matching.py @@ -0,0 +1,136 @@ +"""Product matching & dedup — UPC primary, fuzzy name fallback, confidence scoring. + +Wraps the Phase 1 normalization module with confidence-level classification +and batch matching for purchase ingestion. 
def classify_confidence(score: float, method: MatchMethod) -> MatchConfidence:
    """Map a match score and method to a high/medium/low confidence level.

    A UPC match is always HIGH, whatever its score. Any other method is
    graded on the score alone: at least 0.8 is HIGH, at least 0.5 is MEDIUM,
    anything lower is LOW.
    """
    if method == MatchMethod.UPC or score >= 0.8:
        return MatchConfidence.HIGH
    return MatchConfidence.MEDIUM if score >= 0.5 else MatchConfidence.LOW
+ + Usage: + matcher = ProductMatcher(session) + outcomes = matcher.match_items(items) + """ + + def __init__( + self, + session: Session, + name_threshold: float = 0.4, + auto_create: bool = True, + ): + self.session = session + self.name_threshold = name_threshold + self.auto_create = auto_create + + def match_single( + self, + item: PurchaseItemCreate, + ) -> tuple[NormalizedProduct | None, MatchResult | None, MatchConfidence]: + """Match a single purchase item to a normalized product. + + Returns (product, match_result, confidence_level). + If auto_create is True and no match found, creates a new product. + """ + result = normalize_product( + self.session, + item.product_name_raw, + upc=item.upc, + name_threshold=self.name_threshold, + ) + + if result: + confidence = classify_confidence(result.confidence, result.method) + return result.product, result, confidence + + if self.auto_create: + product = _create_product_from_item(self.session, item) + return product, None, MatchConfidence.LOW + + return None, None, MatchConfidence.LOW + + def match_items(self, items: list[PurchaseItemCreate]) -> list[MatchOutcome]: + """Match a batch of purchase items. 
Returns outcomes in order.""" + outcomes: list[MatchOutcome] = [] + for idx, item in enumerate(items): + product, result, confidence = self.match_single(item) + created = result is None and product is not None + outcomes.append( + MatchOutcome( + item_index=idx, + match=result, + confidence_level=confidence, + created_new=created, + ) + ) + return outcomes + + +def match_purchase_item( + session: Session, + item: PurchaseItemCreate, + name_threshold: float = 0.4, + auto_create: bool = True, +) -> tuple[NormalizedProduct | None, MatchConfidence]: + """Convenience function: match a single item, return (product, confidence).""" + matcher = ProductMatcher(session, name_threshold=name_threshold, auto_create=auto_create) + product, _, confidence = matcher.match_single(item) + return product, confidence diff --git a/src/receiptwitness/pipeline/normalization.py b/src/receiptwitness/pipeline/normalization.py new file mode 100644 index 0000000..a714020 --- /dev/null +++ b/src/receiptwitness/pipeline/normalization.py @@ -0,0 +1,164 @@ +"""Product normalization — Phase 1: UPC matching + fuzzy name matching. + +Matches products across retailers by: +1. Exact UPC match (highest confidence) +2. 
Fuzzy name matching via token-based Jaccard similarity (lower confidence) +""" + +import json +import re +from dataclasses import dataclass +from enum import StrEnum + +from cartsnitch_common.models.product import NormalizedProduct +from sqlalchemy import cast, func, select, String +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.orm import Session + + +class MatchMethod(StrEnum): + """How a product match was determined.""" + + UPC = "upc" + NAME = "name" + + +@dataclass(frozen=True) +class MatchResult: + """Result of a product normalization attempt.""" + + product: NormalizedProduct + confidence: float + method: MatchMethod + + +# Noise words stripped during name cleaning +_NOISE_WORDS = frozenset( + { + "the", + "a", + "an", + "and", + "or", + "of", + "with", + "in", + "for", + "to", + "brand", + "original", + "classic", + "new", + "improved", + } +) + +# Regex for extracting size info (e.g., "16 oz", "1.5 lb", "12 ct") +_SIZE_PATTERN = re.compile( + r"(\d+(?:\.\d+)?)\s*(oz|fl\s*oz|lb|lbs|g|kg|ml|l|ct|pk|count|pack)\b", + re.IGNORECASE, +) + + +def clean_name(name: str) -> str: + """Normalize a product name for comparison. 
+ + - Lowercase + - Remove size info (e.g., "16 oz") + - Strip noise words + - Collapse whitespace + """ + cleaned = name.lower() + cleaned = _SIZE_PATTERN.sub("", cleaned) + cleaned = re.sub(r"[^\w\s]", " ", cleaned) + tokens = cleaned.split() + tokens = [t for t in tokens if t not in _NOISE_WORDS] + return " ".join(tokens) + + +def extract_size_info(name: str) -> tuple[str, str] | None: + """Extract (size, unit) from a product name, if present.""" + match = _SIZE_PATTERN.search(name) + if match: + return match.group(1), match.group(2).lower().replace(" ", "_") + return None + + +def jaccard_similarity(a: str, b: str) -> float: + """Token-based Jaccard similarity between two cleaned names.""" + tokens_a = set(a.split()) + tokens_b = set(b.split()) + if not tokens_a or not tokens_b: + return 0.0 + intersection = tokens_a & tokens_b + union = tokens_a | tokens_b + return len(intersection) / len(union) + + +def match_by_upc(session: Session, upc: str) -> MatchResult | None: + """Find a normalized product by exact UPC match. + + Uses PostgreSQL JSONB containment (@>) for production efficiency. + Falls back to LIKE on SQLite for test compatibility. + """ + dialect_name = session.bind.dialect.name if session.bind else "default" + if dialect_name == "postgresql": + stmt = select(NormalizedProduct).where( + cast(NormalizedProduct.upc_variants, JSONB).op("@>")( + func.cast(json.dumps([upc]), JSONB) + ) + ) + else: + stmt = select(NormalizedProduct).where( + NormalizedProduct.upc_variants.is_not(None), + cast(NormalizedProduct.upc_variants, String).contains(upc), + ) + product = session.execute(stmt).scalars().first() + if product: + return MatchResult(product=product, confidence=1.0, method=MatchMethod.UPC) + return None + + +def match_by_name( + session: Session, + name: str, + threshold: float = 0.5, +) -> MatchResult | None: + """Find the best normalized product by fuzzy name matching. + + Loads all normalized products and computes Jaccard similarity. 
+ Returns the best match above the threshold, or None. + """ + # TODO: Use pg_trgm similarity index for production. + # Current approach loads all products into memory — acceptable for tests + # and small datasets, but will not scale. + cleaned = clean_name(name) + stmt = select(NormalizedProduct) + products = session.execute(stmt).scalars().all() + + best_match: NormalizedProduct | None = None + best_score = 0.0 + + for product in products: + score = jaccard_similarity(cleaned, clean_name(product.canonical_name)) + if score > best_score and score >= threshold: + best_score = score + best_match = product + + if best_match: + return MatchResult(product=best_match, confidence=best_score, method=MatchMethod.NAME) + return None + + +def normalize_product( + session: Session, + name: str, + upc: str | None = None, + name_threshold: float = 0.5, +) -> MatchResult | None: + """Full normalization pipeline: UPC first, then fuzzy name fallback.""" + if upc: + result = match_by_upc(session, upc) + if result: + return result + return match_by_name(session, name, threshold=name_threshold) diff --git a/src/receiptwitness/pipeline/receipt.py b/src/receiptwitness/pipeline/receipt.py new file mode 100644 index 0000000..7d3e863 --- /dev/null +++ b/src/receiptwitness/pipeline/receipt.py @@ -0,0 +1,144 @@ +"""Receipt normalization — parse raw Meijer scraper output into purchase records. + +Maps raw receipt fields, cleans product names, extracts quantities/units. 
+""" + +import re +from datetime import date +from decimal import Decimal, InvalidOperation + +from cartsnitch_common.schemas.purchase import PurchaseCreate, PurchaseItemCreate + + +def _clean_product_name(raw: str) -> str: + """Clean raw product name from scraper output.""" + cleaned = raw.strip() + # Remove leading/trailing non-alphanumeric chars + cleaned = re.sub(r"^\W+|\W+$", "", cleaned) + # Collapse internal whitespace + cleaned = re.sub(r"\s+", " ", cleaned) + return cleaned + + +def _safe_decimal( + value: str | float | int | Decimal | None, + default: Decimal = Decimal("0"), +) -> Decimal: + """Safely convert a value to Decimal.""" + if value is None: + return default + try: + return Decimal(str(value)) + except (InvalidOperation, ValueError): + return default + + +def parse_meijer_item(raw_item: dict) -> PurchaseItemCreate: + """Parse a single Meijer scraper line item into a PurchaseItemCreate. + + Expected raw_item keys (from Meijer scraper): + - description / name: product name + - upc / upcCode: UPC barcode + - quantity / qty: number of units + - unitPrice / price: per-unit price + - extendedPrice / totalPrice: line total + - regularPrice: shelf price before discounts + - salePrice: sale price if applicable + - couponAmount / couponDiscount: coupon savings + - loyaltyAmount / loyaltyDiscount: loyalty savings + - category / department: raw category + """ + name = raw_item.get("description") or raw_item.get("name") or "" + cleaned_name = _clean_product_name(name) + + upc = raw_item.get("upc") or raw_item.get("upcCode") + if upc: + upc = str(upc).strip().lstrip("0") or str(upc).strip() + + qty = _safe_decimal( + raw_item.get("quantity") or raw_item.get("qty"), + default=Decimal("1"), + ) + + unit_price = _safe_decimal(raw_item.get("unitPrice") or raw_item.get("price")) + extended = _safe_decimal(raw_item.get("extendedPrice") or raw_item.get("totalPrice")) + if extended == Decimal("0") and unit_price > 0: + extended = unit_price * qty + + regular = 
raw_item.get("regularPrice") + sale = raw_item.get("salePrice") + coupon = raw_item.get("couponAmount") or raw_item.get("couponDiscount") + loyalty = raw_item.get("loyaltyAmount") or raw_item.get("loyaltyDiscount") + category = raw_item.get("category") or raw_item.get("department") + + return PurchaseItemCreate( + product_name_raw=cleaned_name, + upc=upc, + quantity=qty, + unit_price=unit_price, + extended_price=extended, + regular_price=_safe_decimal(regular) if regular is not None else None, + sale_price=_safe_decimal(sale) if sale is not None else None, + coupon_discount=_safe_decimal(coupon) if coupon is not None else None, + loyalty_discount=_safe_decimal(loyalty) if loyalty is not None else None, + category_raw=str(category).strip() if category else None, + ) + + +def normalize_receipt( + raw_receipt: dict, + user_id: str, + store_id: str, +) -> PurchaseCreate: + """Parse a complete Meijer raw receipt into a PurchaseCreate. + + Expected raw_receipt keys: + - receiptId / receipt_id / id: unique receipt identifier + - date / purchaseDate / purchase_date: purchase date (YYYY-MM-DD or similar) + - total / totalAmount: receipt total + - subtotal: pre-tax subtotal + - tax / taxAmount: tax amount + - savings / totalSavings: total discount savings + - items: list of raw line item dicts + """ + import uuid + + receipt_id = str( + raw_receipt.get("receiptId") + or raw_receipt.get("receipt_id") + or raw_receipt.get("id") + or uuid.uuid4() + ) + + raw_date = ( + raw_receipt.get("date") + or raw_receipt.get("purchaseDate") + or raw_receipt.get("purchase_date") + ) + if isinstance(raw_date, str): + purchase_date = date.fromisoformat(raw_date[:10]) + elif isinstance(raw_date, date): + purchase_date = raw_date + else: + purchase_date = date.today() + + total = _safe_decimal(raw_receipt.get("total") or raw_receipt.get("totalAmount")) + subtotal = raw_receipt.get("subtotal") + tax = raw_receipt.get("tax") or raw_receipt.get("taxAmount") + savings = raw_receipt.get("savings") 
or raw_receipt.get("totalSavings") + + raw_items = raw_receipt.get("items") or [] + items = [parse_meijer_item(item) for item in raw_items] + + return PurchaseCreate( + user_id=uuid.UUID(user_id) if isinstance(user_id, str) else user_id, + store_id=uuid.UUID(store_id) if isinstance(store_id, str) else store_id, + receipt_id=receipt_id, + purchase_date=purchase_date, + total=total, + subtotal=_safe_decimal(subtotal) if subtotal is not None else None, + tax=_safe_decimal(tax) if tax is not None else None, + savings_total=_safe_decimal(savings) if savings is not None else None, + raw_data=raw_receipt, + items=items, + ) diff --git a/src/receiptwitness/queue/__init__.py b/src/receiptwitness/queue/__init__.py new file mode 100644 index 0000000..3f9a31f --- /dev/null +++ b/src/receiptwitness/queue/__init__.py @@ -0,0 +1 @@ +"""DragonflyDB Streams queue for email receipt processing.""" diff --git a/src/receiptwitness/queue/email.py b/src/receiptwitness/queue/email.py new file mode 100644 index 0000000..c76148e --- /dev/null +++ b/src/receiptwitness/queue/email.py @@ -0,0 +1,77 @@ +"""DragonflyDB Streams queue for email receipt processing.""" + +from __future__ import annotations + +import json +import logging +from dataclasses import asdict, dataclass +from typing import cast + +import redis.asyncio as aioredis + +from receiptwitness.config import settings + +logger = logging.getLogger(__name__) + +STREAM_KEY = "email:receipts" +CONSUMER_GROUP = "email-workers" + + +@dataclass +class EmailJob: + """Payload for an email receipt processing job.""" + + user_id: str + sender: str + recipient: str + subject: str + body_html: str | None + body_plain: str | None + received_at: str + message_id: str # from email provider, for dedup + + +async def get_redis() -> aioredis.Redis: + """Get async Redis/DragonflyDB client.""" + return cast(aioredis.Redis, aioredis.from_url(settings.redis_url, decode_responses=True)) + + +async def ensure_consumer_group(client: aioredis.Redis) -> None: 
+ """Create consumer group if it does not exist.""" + try: + await client.xgroup_create(STREAM_KEY, CONSUMER_GROUP, id="0", mkstream=True) + except aioredis.ResponseError as e: + if "BUSYGROUP" not in str(e): + raise + + +async def enqueue_email(client: aioredis.Redis, job: EmailJob) -> str: + """Add email job to the stream. Returns the stream message ID.""" + payload: dict[str, str | bytes | int | float] = {"data": json.dumps(asdict(job))} + msg_id: str = cast(str, await client.xadd(STREAM_KEY, payload)) # type: ignore[arg-type] # redis-py StreamCommands.xadd expects broader FieldT union; runtime behavior is correct + logger.info("Enqueued email job %s for user %s", msg_id, job.user_id) + return msg_id + + +async def consume_emails( + client: aioredis.Redis, + consumer_name: str, + count: int = 1, + block_ms: int = 5000, +) -> list[tuple[str, EmailJob]]: + """Read pending messages from the stream. Returns list of (msg_id, EmailJob).""" + await ensure_consumer_group(client) + messages = await client.xreadgroup( + CONSUMER_GROUP, consumer_name, {STREAM_KEY: ">"}, count=count, block=block_ms + ) + results = [] + for _stream, entries in messages: + for msg_id, fields in entries: + job = EmailJob(**json.loads(fields["data"])) + results.append((msg_id, job)) + return results + + +async def ack_email(client: aioredis.Redis, msg_id: str) -> None: + """Acknowledge a processed message.""" + await client.xack(STREAM_KEY, CONSUMER_GROUP, msg_id) diff --git a/src/receiptwitness/scrapers/__init__.py b/src/receiptwitness/scrapers/__init__.py new file mode 100644 index 0000000..cfc8d9e --- /dev/null +++ b/src/receiptwitness/scrapers/__init__.py @@ -0,0 +1 @@ +"""Retailer scrapers.""" diff --git a/src/receiptwitness/scrapers/base.py b/src/receiptwitness/scrapers/base.py new file mode 100644 index 0000000..fd5fdc3 --- /dev/null +++ b/src/receiptwitness/scrapers/base.py @@ -0,0 +1,72 @@ +"""Abstract base scraper interface for all retailer scrapers.""" + +import asyncio +import 
random +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from datetime import datetime + +from receiptwitness.config import settings + + +@dataclass +class SessionData: + """Holds session cookies and metadata for a retailer login.""" + + cookies: list[dict] + user_agent: str + created_at: datetime + expires_at: datetime | None = None + extra: dict = field(default_factory=dict) + + +@dataclass +class RawReceipt: + """Raw receipt data before parsing.""" + + receipt_id: str + purchase_date: str + store_number: str | None = None + raw_data: dict = field(default_factory=dict) + source_url: str | None = None + + +class BaseScraper(ABC): + """All retailer scrapers implement this interface. + + Provides common functionality: human-like delays, rate limiting guards, + and the abstract methods each retailer scraper must implement. + """ + + @abstractmethod + async def login(self, username: str, password: str) -> SessionData: + """Authenticate with the retailer portal and return session data.""" + ... + + @abstractmethod + async def check_session(self, session: SessionData) -> bool: + """Verify if an existing session is still valid.""" + ... + + @abstractmethod + async def scrape_receipts( + self, session: SessionData, since: datetime | None = None + ) -> list[RawReceipt]: + """Scrape receipt data from the retailer portal.""" + ... + + @abstractmethod + def parse_receipt(self, raw: RawReceipt) -> dict: + """Parse a raw receipt into structured data. + + Returns a dict with keys matching PurchaseCreate schema fields, + including an 'items' list matching PurchaseItemCreate fields. + """ + ... 
+ + async def human_delay(self, min_ms: int | None = None, max_ms: int | None = None) -> None: + """Sleep for a randomized human-like interval.""" + lo = min_ms or settings.min_request_delay_ms + hi = max_ms or settings.max_request_delay_ms + delay = random.randint(lo, hi) / 1000.0 + await asyncio.sleep(delay) diff --git a/src/receiptwitness/scrapers/kroger.py b/src/receiptwitness/scrapers/kroger.py new file mode 100644 index 0000000..a7993af --- /dev/null +++ b/src/receiptwitness/scrapers/kroger.py @@ -0,0 +1,344 @@ +"""Kroger loyalty portal scraper using Playwright. + +Kroger uses Akamai Bot Manager for aggressive headless browser detection. +This scraper uses enhanced stealth measures including playwright-stealth, +realistic fingerprinting, and human-like interaction pacing. +""" + +import logging +from datetime import UTC, datetime, timedelta +from typing import cast + +from playwright.async_api import BrowserContext, Page, Playwright, async_playwright + +from receiptwitness.config import settings +from receiptwitness.scrapers.base import BaseScraper, RawReceipt, SessionData + +logger = logging.getLogger(__name__) + +# Kroger endpoints +KROGER_BASE = "https://www.kroger.com" +KROGER_LOGIN_PAGE = f"{KROGER_BASE}/signin" +KROGER_PURCHASE_HISTORY = f"{KROGER_BASE}/mypurchases" +KROGER_RECEIPT_API = f"{KROGER_BASE}/atlas/v1/purchase-history/api" +KROGER_RECEIPT_DETAIL_API = f"{KROGER_BASE}/atlas/v1/receipt/api" +KROGER_ACCOUNT_PAGE = f"{KROGER_BASE}/account/dashboard" + +# Realistic browser fingerprint — Chrome on Windows (matches Kroger's typical audience) +DEFAULT_USER_AGENT = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/131.0.0.0 Safari/537.36" +) +DEFAULT_VIEWPORT = {"width": 1920, "height": 1080} +DEFAULT_LOCALE = "en-US" +DEFAULT_TIMEZONE = "America/New_York" + + +class KrogerScraper(BaseScraper): + """Scraper for Kroger loyalty purchase history. 
+ + Kroger uses Akamai Bot Manager which aggressively detects headless + browsers. This scraper employs enhanced stealth measures: + - Masks webdriver/automation signals + - Sets realistic browser fingerprint + - Uses human-like interaction pacing + - Preserves browser context across sessions + """ + + async def _create_stealth_context( + self, playwright_instance: Playwright, cookies: list[dict] | None = None + ) -> BrowserContext: + """Create a browser context with enhanced stealth for Akamai evasion.""" + browser = await playwright_instance.chromium.launch( + headless=settings.headless, + args=[ + "--disable-blink-features=AutomationControlled", + "--no-sandbox", + "--disable-dev-shm-usage", + "--disable-infobars", + "--window-size=1920,1080", + ], + ) + context = await browser.new_context( + user_agent=DEFAULT_USER_AGENT, + viewport=DEFAULT_VIEWPORT, # type: ignore[arg-type] + locale=DEFAULT_LOCALE, + timezone_id=DEFAULT_TIMEZONE, + java_script_enabled=True, + bypass_csp=False, + color_scheme="light", + has_touch=False, + ) + + # Enhanced stealth script targeting Akamai Bot Manager detection vectors + await context.add_init_script( + """ + // Mask webdriver flag + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + + // Chrome runtime object + window.chrome = { + runtime: {}, + loadTimes: function() {}, + csi: function() {}, + app: { isInstalled: false } + }; + + // Realistic plugin array + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5] + }); + + // Languages + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'] + }); + + // Platform + Object.defineProperty(navigator, 'platform', { + get: () => 'Win32' + }); + + // Hardware concurrency + Object.defineProperty(navigator, 'hardwareConcurrency', { + get: () => 8 + }); + + // Device memory + Object.defineProperty(navigator, 'deviceMemory', { + get: () => 8 + }); + + // Permissions query override (Akamai checks this) + const 
originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => + parameters.name === 'notifications' + ? Promise.resolve({ state: Notification.permission }) + : originalQuery(parameters); + + // WebGL vendor/renderer (avoid "Google Inc." / "ANGLE" tells) + const getParameter = WebGLRenderingContext.prototype.getParameter; + WebGLRenderingContext.prototype.getParameter = function(parameter) { + if (parameter === 37445) return 'Intel Inc.'; + if (parameter === 37446) return 'Intel Iris OpenGL Engine'; + return getParameter.call(this, parameter); + }; + """ + ) + + if cookies: + await context.add_cookies(cookies) # type: ignore[arg-type] + + return cast(BrowserContext, context) + + async def login(self, username: str, password: str) -> SessionData: + """Log in to Kroger and capture session cookies.""" + async with async_playwright() as p: + context = await self._create_stealth_context(p) + page = await context.new_page() + try: + return await self._perform_login(page, context, username, password) + finally: + if context.browser: + await context.browser.close() + + async def _perform_login( + self, page: Page, context: BrowserContext, username: str, password: str + ) -> SessionData: + """Execute the Kroger login flow.""" + logger.info("Navigating to Kroger sign-in page") + await page.goto(KROGER_LOGIN_PAGE, wait_until="networkidle") + await self.human_delay(2000, 4000) + + # Kroger login form — email/username field + email_input = page.locator( + 'input[id="SignIn-emailInput"], ' + 'input[name="email"], ' + 'input[type="email"], ' + 'input[data-testid="SignIn-emailInput"]' + ) + await email_input.wait_for(state="visible", timeout=settings.browser_timeout_ms) + await email_input.click() + await self.human_delay(300, 700) + await email_input.fill(username) + await self.human_delay(800, 1500) + + # Password field + password_input = page.locator( + 'input[id="SignIn-passwordInput"], ' + 'input[name="password"], ' + 
'input[type="password"], ' + 'input[data-testid="SignIn-passwordInput"]' + ) + await password_input.wait_for(state="visible", timeout=settings.browser_timeout_ms) + await password_input.click() + await self.human_delay(300, 700) + await password_input.fill(password) + await self.human_delay(1000, 2000) + + # Sign-in button + sign_in_btn = page.locator( + 'button[id="SignIn-submitButton"], ' + 'button[data-testid="SignIn-submitButton"], ' + 'button[type="submit"]:has-text("Sign In")' + ) + await sign_in_btn.click() + + # Wait for redirect away from sign-in page + await page.wait_for_url( + lambda url: "signin" not in url.lower(), + timeout=settings.browser_timeout_ms, + ) + await self.human_delay(1500, 3000) + + # Capture cookies + raw_cookies = await context.cookies() + cookies = [dict(c) for c in raw_cookies] + now = datetime.now(UTC) + + logger.info("Kroger login successful, captured %d cookies", len(cookies)) + return SessionData( + cookies=cookies, + user_agent=DEFAULT_USER_AGENT, + created_at=now, + expires_at=now + timedelta(hours=2), + extra={"retailer": "kroger"}, + ) + + async def check_session(self, session: SessionData) -> bool: + """Check if the Kroger session is still valid.""" + if session.expires_at and datetime.now(UTC) > session.expires_at: + logger.info("Kroger session expired based on timestamp") + return False + + async with async_playwright() as p: + context = await self._create_stealth_context(p, cookies=session.cookies) + page = await context.new_page() + try: + response = await page.goto(KROGER_ACCOUNT_PAGE, wait_until="networkidle") + current_url = page.url.lower() + is_valid = "signin" not in current_url and response is not None and response.ok + logger.info("Kroger session check: valid=%s (url=%s)", is_valid, page.url) + return is_valid + except Exception: + logger.exception("Kroger session check failed") + return False + finally: + if context.browser: + await context.browser.close() + + async def scrape_receipts( + self, session: 
SessionData, since: datetime | None = None + ) -> list[RawReceipt]: + """Scrape purchase history from Kroger.""" + async with async_playwright() as p: + context = await self._create_stealth_context(p, cookies=session.cookies) + page = await context.new_page() + try: + return await self._fetch_receipts(page, since) + finally: + if context.browser: + await context.browser.close() + + async def _fetch_receipts(self, page: Page, since: datetime | None) -> list[RawReceipt]: + """Fetch receipt list and details from Kroger purchase history.""" + # Navigate to purchase history to establish context + await page.goto(KROGER_PURCHASE_HISTORY, wait_until="networkidle") + await self.human_delay(1500, 3000) + + receipts: list[RawReceipt] = [] + + # Kroger purchase history API endpoint + api_response = await page.request.get(KROGER_RECEIPT_API) + if not api_response.ok: + logger.warning( + "Kroger purchase history request failed: %d %s", + api_response.status, + api_response.status_text, + ) + return [] + + response = await api_response.json() + if not isinstance(response, dict): + logger.warning("Unexpected purchase history response type: %s", type(response)) + return [] + + # Handle Kroger's response structure + orders = response.get("orders", response.get("purchases", [])) + if not isinstance(orders, list): + logger.warning("No orders found in Kroger purchase history response") + return [] + + logger.info("Found %d orders in Kroger purchase history", len(orders)) + + for order in orders: + raw_id = order.get("orderId") or order.get("receiptId") or order.get("id") or "" + order_id = str(raw_id) + purchase_date = order.get( + "purchaseDate", order.get("transactionDate", order.get("date", "")) + ) + + # Filter by date if 'since' is provided + if since and purchase_date: + try: + txn_dt = datetime.fromisoformat(purchase_date.replace("Z", "+00:00")) + if txn_dt < since: + continue + except (ValueError, TypeError): + pass + + if not order_id: + continue + + await 
self.human_delay(1000, 2500) + + # Fetch receipt detail + detail = await self._fetch_receipt_detail(page, order_id) + + raw_store = ( + order.get("storeNumber") + or order.get("divisionNumber") + or order.get("storeId") + or "" + ) + store_number = str(raw_store) + + receipts.append( + RawReceipt( + receipt_id=order_id, + purchase_date=purchase_date, + store_number=store_number, + raw_data={**order, "detail": detail}, + source_url=f"{KROGER_RECEIPT_DETAIL_API}?orderId={order_id}", + ) + ) + + logger.info("Scraped %d receipts from Kroger", len(receipts)) + return receipts + + async def _fetch_receipt_detail(self, page: Page, order_id: str) -> dict: + """Fetch detailed receipt data for a single Kroger order.""" + try: + url = f"{KROGER_RECEIPT_DETAIL_API}?orderId={order_id}" + api_response = await page.request.get(url) + if not api_response.ok: + logger.warning( + "Kroger receipt detail request failed for %s: %d", + order_id, + api_response.status, + ) + return {} + detail = await api_response.json() + return detail if isinstance(detail, dict) else {} + except Exception: + logger.exception("Failed to fetch Kroger receipt detail for %s", order_id) + return {} + + def parse_receipt(self, raw: RawReceipt) -> dict: + """Parse raw Kroger receipt into structured purchase data.""" + from receiptwitness.parsers.kroger import parse_kroger_receipt + + return parse_kroger_receipt(raw) diff --git a/src/receiptwitness/scrapers/meijer.py b/src/receiptwitness/scrapers/meijer.py new file mode 100644 index 0000000..4a4dd8e --- /dev/null +++ b/src/receiptwitness/scrapers/meijer.py @@ -0,0 +1,301 @@ +"""Meijer mPerks scraper using Playwright. + +Meijer has no public API. We reverse-engineer the XHR endpoints the mPerks +web app uses to pull purchase history and receipt data. The flow: + +1. Launch stealth Playwright browser +2. Navigate to mPerks login page and authenticate +3. Capture session cookies after successful login +4. 
class MeijerScraper(BaseScraper):
    """Scraper for Meijer mPerks purchase history.

    Logs in through the web portal with Playwright, captures the session
    cookies, and replays them against the purchase-history and receipt
    XHR endpoints to collect raw receipt payloads.
    """

    async def _create_stealth_context(
        self, playwright_instance: Playwright, cookies: list[dict] | None = None
    ) -> BrowserContext:
        """Launch Chromium and return a context with stealth settings applied.

        Args:
            playwright_instance: Running Playwright driver used to launch Chromium.
            cookies: Optional cookies to preload into the new context.

        Returns:
            The configured browser context. Callers close the browser through
            ``context.browser`` once they are done with it.

        Raises:
            Exception: Whatever Playwright raised while configuring the
                context. The launched browser is closed before re-raising.
        """
        browser = await playwright_instance.chromium.launch(
            headless=settings.headless,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--no-sandbox",
            ],
        )
        try:
            context = await browser.new_context(
                user_agent=DEFAULT_USER_AGENT,
                viewport=DEFAULT_VIEWPORT,  # type: ignore[arg-type]
                locale=DEFAULT_LOCALE,
                timezone_id=DEFAULT_TIMEZONE,
                java_script_enabled=True,
                bypass_csp=False,
            )
            # Mask webdriver flag
            await context.add_init_script(
                """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });
                // Mask chrome automation indicators
                window.chrome = { runtime: {} };
                Object.defineProperty(navigator, 'plugins', {
                    get: () => [1, 2, 3, 4, 5]
                });
                Object.defineProperty(navigator, 'languages', {
                    get: () => ['en-US', 'en']
                });
                """
            )
            if cookies:
                await context.add_cookies(cookies)  # type: ignore[arg-type]
        except Exception:
            # Callers reach the browser only through the returned context, so
            # a failure here must release it before propagating.
            await browser.close()
            raise
        return cast(BrowserContext, context)

    async def login(self, username: str, password: str) -> SessionData:
        """Log in to Meijer mPerks and capture session cookies.

        The mPerks login flow:
        1. Navigate to login page
        2. Fill email and password fields
        3. Click sign-in button
        4. Wait for redirect to mPerks dashboard
        5. Extract session cookies

        Args:
            username: Account e-mail typed into the login form.
            password: Account password typed into the login form.

        Returns:
            Session data holding the cookies captured after login.
        """
        async with async_playwright() as p:
            context = await self._create_stealth_context(p)
            page = await context.new_page()
            try:
                return await self._perform_login(page, context, username, password)
            finally:
                if context.browser:
                    await context.browser.close()

    async def _perform_login(
        self, page: Page, context: BrowserContext, username: str, password: str
    ) -> SessionData:
        """Execute the login flow on the mPerks portal.

        Fills the form with short randomized pauses between steps, waits
        until the URL no longer contains "login", then snapshots the
        context's cookies. The returned session is stamped to expire four
        hours after capture.
        """
        logger.info("Navigating to Meijer login page")
        await page.goto(MEIJER_LOGIN_PAGE, wait_until="networkidle")
        await self.human_delay(1500, 3000)

        # Fill email field
        email_input = page.locator('input[type="email"], input[name="email"], #email')
        await email_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
        await email_input.click()
        await self.human_delay(200, 500)
        await email_input.fill(username)
        await self.human_delay(500, 1000)

        # Fill password field
        password_input = page.locator('input[type="password"], input[name="password"], #password')
        await password_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
        await password_input.click()
        await self.human_delay(200, 500)
        await password_input.fill(password)
        await self.human_delay(500, 1500)

        # Click sign-in button
        sign_in_btn = page.locator(
            'button[type="submit"], button:has-text("Sign In"), button:has-text("Log In")'
        )
        await sign_in_btn.click()

        # Wait for navigation after login
        await page.wait_for_url(
            lambda url: "login" not in url.lower(),
            timeout=settings.browser_timeout_ms,
        )
        await self.human_delay(1000, 2000)

        # Capture cookies
        raw_cookies = await context.cookies()
        cookies = [dict(c) for c in raw_cookies]
        now = datetime.now(UTC)

        logger.info("Meijer login successful, captured %d cookies", len(cookies))
        return SessionData(
            cookies=cookies,
            user_agent=DEFAULT_USER_AGENT,
            created_at=now,
            expires_at=now + timedelta(hours=4),
        )

    async def check_session(self, session: SessionData) -> bool:
        """Check if the mPerks session is still valid.

        A session past its ``expires_at`` timestamp is rejected without
        opening a browser. Otherwise the mPerks home page is loaded with the
        stored cookies; the session counts as valid when the final URL does
        not contain "login" and the response is OK. Any error during the
        check is logged and reported as an invalid session.
        """
        if session.expires_at and datetime.now(UTC) > session.expires_at:
            logger.info("Meijer session expired based on timestamp")
            return False

        async with async_playwright() as p:
            context = await self._create_stealth_context(p, cookies=session.cookies)
            page = await context.new_page()
            try:
                response = await page.goto(MEIJER_MPERKS_HOME, wait_until="networkidle")
                current_url = page.url.lower()
                is_valid = "login" not in current_url and response is not None and response.ok
                logger.info("Meijer session check: valid=%s (url=%s)", is_valid, page.url)
                return is_valid
            except Exception:
                logger.exception("Meijer session check failed")
                return False
            finally:
                if context.browser:
                    await context.browser.close()

    async def scrape_receipts(
        self, session: SessionData, since: datetime | None = None
    ) -> list[RawReceipt]:
        """Scrape purchase history from Meijer mPerks.

        Args:
            session: Previously captured session whose cookies are replayed.
            since: When given, transactions dated before it are skipped.

        Returns:
            One raw receipt per transaction that passed the date filter.
        """
        async with async_playwright() as p:
            context = await self._create_stealth_context(p, cookies=session.cookies)
            page = await context.new_page()
            try:
                return await self._fetch_receipts(page, since)
            finally:
                if context.browser:
                    await context.browser.close()

    @staticmethod
    def _purchased_before(purchase_date: object, since: datetime | None) -> bool:
        """Return True only when *purchase_date* parses and is earlier than *since*.

        Missing, non-string or unparseable dates, and dates that cannot be
        compared with *since* (naive vs. aware), return False so the
        transaction is kept rather than silently dropped.
        """
        if since is None or not isinstance(purchase_date, str) or not purchase_date:
            return False
        try:
            return datetime.fromisoformat(purchase_date.replace("Z", "+00:00")) < since
        except (ValueError, TypeError):
            return False

    async def _fetch_receipts(self, page: Page, since: datetime | None) -> list[RawReceipt]:
        """Fetch receipt list and detail via mPerks XHR endpoints.

        Uses Playwright's page.request API (APIRequestContext) instead of
        page.evaluate(fetch(...)) for better observability — requests show up
        in Playwright traces and can be intercepted by route handlers.

        Returns an empty list when the listing request fails or the payload
        does not have the expected shape. Malformed individual entries are
        skipped instead of aborting the whole scrape.
        """
        # Navigate to mPerks to establish context (cookies need domain context)
        await page.goto(MEIJER_MPERKS_HOME, wait_until="networkidle")
        await self.human_delay(1000, 2000)

        # Fetch purchase history listing via page.request (APIRequestContext)
        api_response = await page.request.get(MEIJER_PURCHASE_HISTORY)
        if not api_response.ok:
            logger.warning(
                "Purchase history request failed: %d %s",
                api_response.status,
                api_response.status_text,
            )
            return []

        response = await api_response.json()
        if not isinstance(response, dict):
            logger.warning("Unexpected purchase history response type: %s", type(response))
            return []

        # `or` (not a .get default) so a null/empty primary key still falls
        # back to the alternate key.
        transactions = response.get("transactions") or response.get("purchaseHistory") or []
        if not isinstance(transactions, list):
            logger.warning("No transactions found in purchase history response")
            return []

        logger.info("Found %d transactions in Meijer purchase history", len(transactions))

        receipts: list[RawReceipt] = []
        for txn in transactions:
            if not isinstance(txn, dict):
                logger.warning("Skipping malformed transaction entry of type %s", type(txn))
                continue

            # A null id must not become the literal string "None".
            receipt_id = str(txn.get("transactionId") or txn.get("receiptId") or "")
            purchase_date = txn.get("transactionDate") or txn.get("purchaseDate") or ""

            # Filter by date if 'since' is provided
            if self._purchased_before(purchase_date, since):
                continue
            if not receipt_id:
                continue

            await self.human_delay(800, 2000)

            # Fetch receipt detail
            detail = await self._fetch_receipt_detail(page, receipt_id)

            receipts.append(
                RawReceipt(
                    receipt_id=receipt_id,
                    purchase_date=purchase_date,
                    store_number=str(txn.get("storeNumber") or txn.get("storeId") or ""),
                    raw_data={**txn, "detail": detail},
                    source_url=f"{MEIJER_RECEIPT_DETAIL}?receiptId={receipt_id}",
                )
            )

        logger.info("Scraped %d receipts from Meijer", len(receipts))
        return receipts

    async def _fetch_receipt_detail(self, page: Page, receipt_id: str) -> dict:
        """Fetch detailed receipt data for a single transaction.

        Uses Playwright's page.request API for traceability. Best-effort:
        a failed request, a non-object payload, or any raised error is
        logged and yields an empty dict.
        """
        try:
            # Pass the id as a query parameter so Playwright URL-encodes it.
            api_response = await page.request.get(
                MEIJER_RECEIPT_DETAIL, params={"receiptId": receipt_id}
            )
            if not api_response.ok:
                logger.warning(
                    "Receipt detail request failed for %s: %d",
                    receipt_id,
                    api_response.status,
                )
                return {}
            detail = await api_response.json()
            return detail if isinstance(detail, dict) else {}
        except Exception:
            logger.exception("Failed to fetch receipt detail for %s", receipt_id)
            return {}

    def parse_receipt(self, raw: RawReceipt) -> dict:
        """Parse raw Meijer receipt into structured purchase data.

        Delegates to the dedicated parser module, imported lazily at call time.
        """
        from receiptwitness.parsers.meijer import parse_meijer_receipt

        return parse_meijer_receipt(raw)
class TargetScraper(BaseScraper):
    """Scraper for Target Circle in-store purchase history.

    Target's order history SPA loads purchase data from internal API
    endpoints. This scraper authenticates via the web login flow,
    captures session cookies, and uses those to hit the order history
    API for in-store receipt data.
    """

    async def _create_stealth_context(
        self, playwright_instance: Playwright, cookies: list[dict] | None = None
    ) -> BrowserContext:
        """Launch Chromium and return a context with stealth settings for Target.

        Args:
            playwright_instance: Running Playwright driver used to launch Chromium.
            cookies: Optional cookies to preload into the new context.

        Returns:
            The configured browser context. Callers close the browser through
            ``context.browser`` once they are done with it.

        Raises:
            Exception: Whatever Playwright raised while configuring the
                context. The launched browser is closed before re-raising.
        """
        browser = await playwright_instance.chromium.launch(
            headless=settings.headless,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--no-sandbox",
                "--disable-dev-shm-usage",
            ],
        )
        try:
            context = await browser.new_context(
                user_agent=DEFAULT_USER_AGENT,
                viewport=DEFAULT_VIEWPORT,  # type: ignore[arg-type]
                locale=DEFAULT_LOCALE,
                timezone_id=DEFAULT_TIMEZONE,
                java_script_enabled=True,
                bypass_csp=False,
                color_scheme="light",
                has_touch=False,
            )
            # Mask webdriver and automation signals
            await context.add_init_script(
                """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });

                window.chrome = {
                    runtime: {},
                    loadTimes: function() {},
                    csi: function() {},
                    app: { isInstalled: false }
                };

                Object.defineProperty(navigator, 'plugins', {
                    get: () => [1, 2, 3, 4, 5]
                });

                Object.defineProperty(navigator, 'languages', {
                    get: () => ['en-US', 'en']
                });

                Object.defineProperty(navigator, 'platform', {
                    get: () => 'Win32'
                });

                Object.defineProperty(navigator, 'hardwareConcurrency', {
                    get: () => 8
                });

                Object.defineProperty(navigator, 'deviceMemory', {
                    get: () => 8
                });
                """
            )
            if cookies:
                await context.add_cookies(cookies)  # type: ignore[arg-type]
        except Exception:
            # Callers reach the browser only through the returned context, so
            # a failure here must release it before propagating.
            await browser.close()
            raise
        return cast(BrowserContext, context)

    async def login(self, username: str, password: str) -> SessionData:
        """Log in to Target and capture session cookies.

        Args:
            username: Account e-mail/username typed into the login form.
            password: Account password typed into the login form.

        Returns:
            Session data holding the cookies captured after login.
        """
        async with async_playwright() as p:
            context = await self._create_stealth_context(p)
            page = await context.new_page()
            try:
                return await self._perform_login(page, context, username, password)
            finally:
                if context.browser:
                    await context.browser.close()

    async def _perform_login(
        self, page: Page, context: BrowserContext, username: str, password: str
    ) -> SessionData:
        """Execute the Target login flow.

        Fills the form with short randomized pauses between steps, waits
        until the URL no longer contains "login", then snapshots the
        context's cookies. The returned session is stamped to expire two
        hours after capture.
        """
        logger.info("Navigating to Target sign-in page")
        await page.goto(TARGET_LOGIN_PAGE, wait_until="networkidle")
        await self.human_delay(2000, 4000)

        # Target login form — email/username field
        email_input = page.locator(
            'input[id="username"], '
            'input[name="username"], '
            'input[type="email"], '
            'input[data-test="username"]'
        )
        await email_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
        await email_input.click()
        await self.human_delay(300, 700)
        await email_input.fill(username)
        await self.human_delay(800, 1500)

        # Password field
        password_input = page.locator(
            'input[id="password"], '
            'input[name="password"], '
            'input[type="password"], '
            'input[data-test="password"]'
        )
        await password_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
        await password_input.click()
        await self.human_delay(300, 700)
        await password_input.fill(password)
        await self.human_delay(1000, 2000)

        # Sign-in button
        sign_in_btn = page.locator(
            'button[id="login"], '
            'button[data-test="login-button"], '
            'button[type="submit"]:has-text("Sign in")'
        )
        await sign_in_btn.click()

        # Wait for redirect away from login page
        await page.wait_for_url(
            lambda url: "login" not in url.lower(),
            timeout=settings.browser_timeout_ms,
        )
        await self.human_delay(1500, 3000)

        # Capture cookies
        raw_cookies = await context.cookies()
        cookies = [dict(c) for c in raw_cookies]
        now = datetime.now(UTC)

        logger.info("Target login successful, captured %d cookies", len(cookies))
        return SessionData(
            cookies=cookies,
            user_agent=DEFAULT_USER_AGENT,
            created_at=now,
            expires_at=now + timedelta(hours=2),
            extra={"retailer": "target"},
        )

    async def check_session(self, session: SessionData) -> bool:
        """Check if the Target session is still valid.

        A session past its ``expires_at`` timestamp is rejected without
        opening a browser. Otherwise the account page is loaded with the
        stored cookies; the session counts as valid when the final URL does
        not contain "login" and the response is OK. Any error during the
        check is logged and reported as an invalid session.
        """
        if session.expires_at and datetime.now(UTC) > session.expires_at:
            logger.info("Target session expired based on timestamp")
            return False

        async with async_playwright() as p:
            context = await self._create_stealth_context(p, cookies=session.cookies)
            page = await context.new_page()
            try:
                response = await page.goto(TARGET_ACCOUNT_PAGE, wait_until="networkidle")
                current_url = page.url.lower()
                is_valid = "login" not in current_url and response is not None and response.ok
                logger.info("Target session check: valid=%s (url=%s)", is_valid, page.url)
                return is_valid
            except Exception:
                logger.exception("Target session check failed")
                return False
            finally:
                if context.browser:
                    await context.browser.close()

    async def scrape_receipts(
        self, session: SessionData, since: datetime | None = None
    ) -> list[RawReceipt]:
        """Scrape in-store purchase history from Target Circle.

        Args:
            session: Previously captured session whose cookies are replayed.
            since: When given, orders dated before it are skipped.

        Returns:
            One raw receipt per order that passed the date filter.
        """
        async with async_playwright() as p:
            context = await self._create_stealth_context(p, cookies=session.cookies)
            page = await context.new_page()
            try:
                return await self._fetch_receipts(page, since)
            finally:
                if context.browser:
                    await context.browser.close()

    @staticmethod
    def _purchased_before(purchase_date: object, since: datetime | None) -> bool:
        """Return True only when *purchase_date* parses and is earlier than *since*.

        Missing, non-string or unparseable dates, and dates that cannot be
        compared with *since* (naive vs. aware), return False so the order
        is kept rather than silently dropped.
        """
        if since is None or not isinstance(purchase_date, str) or not purchase_date:
            return False
        try:
            return datetime.fromisoformat(purchase_date.replace("Z", "+00:00")) < since
        except (ValueError, TypeError):
            return False

    async def _fetch_receipts(self, page: Page, since: datetime | None) -> list[RawReceipt]:
        """Fetch receipt list and details from Target order history.

        Target's order history page has separate tabs for online and in-store
        purchases. We target the in-store tab which shows Circle-linked
        transactions.

        Returns an empty list when the listing request fails or the payload
        does not have the expected shape. Malformed individual entries are
        skipped instead of aborting the whole scrape.
        """
        # Navigate to order history to establish context
        await page.goto(TARGET_ORDER_HISTORY, wait_until="networkidle")
        await self.human_delay(1500, 3000)

        # Target order history API — filter for in-store purchases
        api_response = await page.request.get(
            TARGET_ORDER_API,
            params={"channel": "in_store", "limit": "50"},
        )
        if not api_response.ok:
            logger.warning(
                "Target order history request failed: %d %s",
                api_response.status,
                api_response.status_text,
            )
            return []

        response = await api_response.json()
        if not isinstance(response, dict):
            logger.warning("Unexpected order history response type: %s", type(response))
            return []

        # Target uses "orders" key for in-store purchase list; `or` so a
        # null/empty primary key still falls back to the alternate key.
        orders = response.get("orders") or response.get("transactions") or []
        if not isinstance(orders, list):
            logger.warning("No orders found in Target order history response")
            return []

        logger.info("Found %d in-store orders in Target history", len(orders))

        receipts: list[RawReceipt] = []
        for order in orders:
            if not isinstance(order, dict):
                logger.warning("Skipping malformed order entry of type %s", type(order))
                continue

            raw_id = order.get("orderId") or order.get("transactionId") or order.get("id") or ""
            order_id = str(raw_id)
            purchase_date = (
                order.get("purchaseDate")
                or order.get("transactionDate")
                or order.get("date")
                or ""
            )

            # Filter by date if 'since' is provided
            if self._purchased_before(purchase_date, since):
                continue
            if not order_id:
                continue

            await self.human_delay(1000, 2500)

            # Fetch receipt detail
            detail = await self._fetch_receipt_detail(page, order_id)

            raw_store = (
                order.get("storeNumber") or order.get("storeId") or order.get("locationId") or ""
            )
            store_number = str(raw_store)

            receipts.append(
                RawReceipt(
                    receipt_id=order_id,
                    purchase_date=purchase_date,
                    store_number=store_number,
                    raw_data={**order, "detail": detail},
                    source_url=f"{TARGET_RECEIPT_API}/{order_id}",
                )
            )

        logger.info("Scraped %d receipts from Target", len(receipts))
        return receipts

    async def _fetch_receipt_detail(self, page: Page, order_id: str) -> dict:
        """Fetch detailed receipt data for a single Target order.

        Best-effort: a failed request, a non-object payload, or any raised
        error is logged and yields an empty dict.
        """
        from urllib.parse import quote

        try:
            # Escape the id so it always stays a single path segment.
            url = f"{TARGET_RECEIPT_API}/{quote(order_id, safe='')}"
            api_response = await page.request.get(url)
            if not api_response.ok:
                logger.warning(
                    "Target receipt detail request failed for %s: %d",
                    order_id,
                    api_response.status,
                )
                return {}
            detail = await api_response.json()
            return detail if isinstance(detail, dict) else {}
        except Exception:
            logger.exception("Failed to fetch Target receipt detail for %s", order_id)
            return {}

    def parse_receipt(self, raw: RawReceipt) -> dict:
        """Parse raw Target receipt into structured purchase data.

        Delegates to the dedicated parser module, imported lazily at call time.
        """
        from receiptwitness.parsers.target import parse_target_receipt

        return parse_target_receipt(raw)
def _get_fernet() -> Fernet:
    """Build the Fernet cipher from the configured session encryption key.

    Raises:
        ValueError: If no encryption key is configured.
    """
    configured_key = settings.session_encryption_key
    if configured_key:
        # Fernet is given bytes; a str key is encoded first.
        key_bytes = configured_key.encode() if isinstance(configured_key, str) else configured_key
        return Fernet(key_bytes)
    raise ValueError(
        "RW_SESSION_ENCRYPTION_KEY is not set. "
        "Generate one with: "
        "python -c 'from cryptography.fernet import Fernet; "
        "print(Fernet.generate_key().decode())'"
    )


def encrypt_session_data(data: dict) -> str:
    """Serialize *data* to JSON and encrypt it into a Fernet token string.

    Values the JSON encoder cannot handle natively are stringified
    (``default=str``). The token is returned as UTF-8 text.
    """
    cipher = _get_fernet()
    serialized = json.dumps(data, default=str)
    token = cipher.encrypt(serialized.encode("utf-8"))
    return token.decode("utf-8")


def decrypt_session_data(encrypted: str) -> dict:
    """Decrypt a Fernet token string and parse the JSON payload inside it.

    Raises:
        InvalidToken: If the token cannot be decrypted with the configured
            key. The failure is logged before being re-raised.
    """
    cipher = _get_fernet()
    try:
        decoded = cipher.decrypt(encrypted.encode("utf-8"))
        payload: dict = json.loads(decoded)
    except InvalidToken:
        logger.error("Failed to decrypt session data — invalid token or wrong key")
        raise
    return payload
def session_from_db_record(session_data_encrypted: str | None) -> SessionData | None:
    """Decrypt a stored session value and rebuild the ``SessionData`` object.

    Returns ``None`` when there is nothing stored, or when decryption or
    reconstruction fails for any reason (the failure is logged).
    """
    if not session_data_encrypted:
        return None

    try:
        payload = decrypt_session_data(session_data_encrypted)
        return SessionData(
            cookies=payload["cookies"],
            user_agent=payload["user_agent"],
            created_at=datetime.fromisoformat(payload["created_at"]),
            # expires_at is optional in the stored payload.
            expires_at=(
                None
                if not payload.get("expires_at")
                else datetime.fromisoformat(payload["expires_at"])
            ),
            extra=payload.get("extra", {}),
        )
    except Exception:
        logger.exception("Failed to load session from DB record")
        return None


def session_to_db_value(session: SessionData) -> str:
    """Serialize a session to a dict, ISO-format its datetimes, and encrypt it."""
    # Datetimes are written as ISO strings so session_from_db_record can
    # parse them back with datetime.fromisoformat.
    iso_fields = {"created_at": session.created_at.isoformat()}
    if session.expires_at:
        iso_fields["expires_at"] = session.expires_at.isoformat()
    return encrypt_session_data({**asdict(session), **iso_fields})


async def get_valid_session(
    scraper: BaseScraper,
    session_data_encrypted: str | None,
    username: str,
    password: str,
) -> tuple[SessionData, bool]:
    """Return a usable session, logging in again when the stored one is unusable.

    Returns:
        A tuple of (session, was_refreshed). If was_refreshed is True,
        the caller should persist the new session to the database.
    """
    stored = session_from_db_record(session_data_encrypted)
    if stored:
        past_expiry = bool(stored.expires_at) and datetime.now(UTC) > stored.expires_at
        if past_expiry:
            logger.info("Session expired by timestamp, re-authenticating")
        else:
            # Only ask the scraper to verify when the timestamp has not
            # already ruled the session out.
            if await scraper.check_session(stored):
                logger.info("Existing session is valid")
                return stored, False
            logger.info("Session check failed, re-authenticating")

    # No stored session, or it is unusable: authenticate from scratch.
    logger.info("Performing fresh login")
    fresh = await scraper.login(username, password)
    return fresh, True
# --- src/receiptwitness/worker/email_worker.py ---


def _redact_token(token: object) -> str:
    """Return a log-safe form of an inbound token (first four characters only)."""
    text = str(token)
    return f"{text[:4]}..." if len(text) > 4 else "..."


async def resolve_user(token: str) -> str | None:
    """Look up user_id from email_inbound_token.

    Args:
        token: Inbound e-mail token taken from the queued job.

    Returns:
        The matching user's id as a string, or ``None`` when the token is
        empty or no user has it.
    """
    if not token:
        # Never use an empty token as a lookup key.
        return None

    # NOTE(review): a session factory is requested on every call; confirm
    # get_async_session_factory reuses its engine rather than building one.
    session_factory = get_async_session_factory(settings.database_url)
    async with session_factory() as session:
        result = await session.execute(select(User.id).where(User.email_inbound_token == token))
        user_id = result.scalar_one_or_none()
    # `is not None` so a falsy-but-real id is not treated as "not found".
    return str(user_id) if user_id is not None else None


async def process_job(msg_id: str, job) -> bool:
    """Process a single email job.

    Resolves the user from the job's token, detects the retailer, parses
    the e-mail with that retailer's parser and publishes a
    receipt-ingested event.

    Args:
        msg_id: Stream message id; also the fallback purchase id when the
            parser returns no ``receipt_id``.
        job: Queued e-mail job. Its ``user_id`` field holds the inbound
            token, not a user id.

    Returns:
        True when the message should be acknowledged. Unknown tokens and
        unrecognized retailers are acknowledged as well, so they are not
        retried forever.
    """
    # 1. Resolve user from token
    user_id = await resolve_user(job.user_id)  # user_id field holds token
    if not user_id:
        # The token routes mail to a user account, so only a redacted form
        # is written to the logs.
        logger.warning(
            "Unknown token %s, dropping message %s", _redact_token(job.user_id), msg_id
        )
        return True  # ack to avoid infinite retry

    # 2. Build EmailReceipt
    email = EmailReceipt(
        sender=job.sender,
        recipient=job.recipient,
        subject=job.subject,
        body_html=job.body_html,
        body_plain=job.body_plain,
        received_at=job.received_at,
    )

    # 3. Detect retailer
    retailer = detect_retailer(email)
    if not retailer or retailer not in PARSERS:
        logger.warning(
            "Unrecognized retailer from %s, archiving msg %s",
            job.sender,
            msg_id,
        )
        return True  # ack — no parser available

    # 4. Parse
    parser = PARSERS[retailer]
    parsed = parser.parse(email)

    # 5. Publish event
    await publish_receipt_ingested(
        user_id=user_id,
        store_slug=retailer,
        purchase_id=parsed.get("receipt_id", msg_id),
        purchase_date=parsed.get("purchase_date", ""),
        item_count=len(parsed.get("items", [])),
        total=parsed.get("total", 0),
    )
    return True


async def run_worker() -> None:
    """Main worker loop — consume and process email jobs.

    Runs until cancelled. A job that raises is logged and left
    unacknowledged. An error in the consume step is logged and followed by
    a five-second pause before the loop continues.
    """
    client = await get_redis()
    logger.info("Email worker started, consuming from email:receipts")
    while True:
        try:
            jobs = await consume_emails(client, CONSUMER_NAME, count=5, block_ms=5000)
            for msg_id, job in jobs:
                try:
                    success = await process_job(msg_id, job)
                    if success:
                        await ack_email(client, msg_id)
                except Exception:
                    # Deliberately not acked, so the message is not lost.
                    logger.exception("Failed to process email job %s", msg_id)
        except Exception:
            logger.exception("Worker loop error, retrying in 5s")
            await asyncio.sleep(5)


if __name__ == "__main__":
    asyncio.run(run_worker())


# --- tests/conftest.py ---


def _load_fixture(filename: str) -> dict:
    """Read a JSON fixture from FIXTURES_DIR, decoding it explicitly as UTF-8."""
    with open(FIXTURES_DIR / filename, encoding="utf-8") as f:
        return json.load(f)


@pytest.fixture
def meijer_receipt_data() -> dict:
    """Load the sample Meijer receipt fixture."""
    return _load_fixture("meijer_receipt.json")


@pytest.fixture
def kroger_receipt_data() -> dict:
    """Load the sample Kroger receipt fixture."""
    return _load_fixture("kroger_receipt.json")


@pytest.fixture
def target_receipt_data() -> dict:
    """Load the sample Target receipt fixture."""
    return _load_fixture("target_receipt.json")
a/tests/fixtures/kroger_email_receipt.html b/tests/fixtures/kroger_email_receipt.html new file mode 100644 index 0000000..9cb33f8 --- /dev/null +++ b/tests/fixtures/kroger_email_receipt.html @@ -0,0 +1,45 @@ + + + + + Kroger Digital Receipt + + +
+ Kroger +

Your Digital Receipt

+

Kroger Plus Member

+
+ +
+

Kroger #882 - Downtown

+

123 Main Street
Anytown, OH 45202

+

Date: 03/15/2026

+

Receipt #: KR-2026-0315-4829

+

Transaction #: TXN-789123456

+
+ +
+

Items Purchased

+

Whole Milk 1 Gallon $3.99

+

Sourdough Bread $4.49

+

Free Range Eggs 12ct $5.99

+

Baby Spinach 5oz $4.29

+
+ +
+

Subtotal: $18.76

+

Tax: $1.24

+

Total: $20.00

+
+ +
+

Kroger Plus Savings: $3.25 saved on this order.

+
+ +
+

Thank you for shopping at Kroger!

+

Keep your receipt for returns within 90 days.

+
+ + \ No newline at end of file diff --git a/tests/fixtures/kroger_receipt.json b/tests/fixtures/kroger_receipt.json new file mode 100644 index 0000000..51c0481 --- /dev/null +++ b/tests/fixtures/kroger_receipt.json @@ -0,0 +1,131 @@ +{ + "orderId": "KR-2026-0312-4471", + "purchaseDate": "2026-03-12T16:45:00Z", + "storeNumber": "00357", + "divisionNumber": "014", + "total": 94.17, + "savings": 15.30, + "detail": { + "receiptId": "KR-2026-0312-4471", + "items": [ + { + "description": "KROGER WHOLE MILK GAL", + "upc": "0001111041700", + "quantity": 1, + "basePrice": 3.99, + "totalPrice": 3.99, + "regularPrice": 4.29, + "salePrice": 3.99, + "couponAmount": 0.0, + "plusCardSavings": 0.30, + "department": "DAIRY" + }, + { + "description": "BANANAS", + "upc": "0000000004011", + "quantity": 1, + "basePrice": 0.59, + "totalPrice": 0.59, + "regularPrice": 0.59, + "salePrice": null, + "couponAmount": null, + "plusCardSavings": null, + "department": "PRODUCE" + }, + { + "description": "SIMPLE TRUTH ORG EGGS 12CT", + "upc": "0001111087840", + "quantity": 2, + "basePrice": 5.49, + "totalPrice": 10.98, + "regularPrice": 5.99, + "salePrice": 5.49, + "couponAmount": 0.0, + "plusCardSavings": 1.00, + "department": "DAIRY" + }, + { + "description": "KROGER DELI TURKEY BREAST", + "upc": null, + "quantity": 0.68, + "basePrice": 9.99, + "totalPrice": 6.79, + "regularPrice": 9.99, + "salePrice": null, + "weight": 0.68, + "weightUom": "LB", + "department": "DELI" + }, + { + "description": "TIDE PODS 42CT", + "upc": "0003700096223", + "quantity": 1, + "basePrice": 13.99, + "totalPrice": 13.99, + "regularPrice": 15.99, + "salePrice": 13.99, + "couponAmount": 2.00, + "plusCardSavings": 0.0, + "department": "HOUSEHOLD" + }, + { + "description": "VOIDED DORITOS NACHO", + "upc": "0002840032505", + "quantity": 1, + "basePrice": 4.79, + "totalPrice": 4.79, + "voided": true, + "department": "SNACKS" + }, + { + "description": "RETURNED GATORADE 8PK", + "upc": "0005200012505", + "quantity": 1, + 
"basePrice": 7.99, + "totalPrice": 7.99, + "status": "RETURNED", + "department": "BEVERAGES" + }, + { + "description": "KROGER SHARP CHEDDAR 8OZ", + "upc": "0001111060930", + "quantity": 1, + "basePrice": 3.49, + "totalPrice": 3.49, + "regularPrice": 3.49, + "salePrice": null, + "couponAmount": null, + "plusCardSavings": null, + "department": "DAIRY" + }, + { + "description": "PRIVATE SELECTION PASTA", + "upc": "0001111085612", + "quantity": 3, + "basePrice": 2.49, + "totalPrice": 7.47, + "regularPrice": 2.99, + "salePrice": 2.49, + "couponAmount": 0.0, + "plusCardSavings": 1.50, + "department": "GROCERY" + }, + { + "description": "KROGER GROUND BEEF 80/20", + "upc": null, + "quantity": 1.23, + "basePrice": 5.99, + "totalPrice": 7.37, + "regularPrice": 6.99, + "salePrice": 5.99, + "weight": 1.23, + "weightUom": "LB", + "department": "MEAT" + } + ], + "subtotal": 78.47, + "tax": 5.50, + "total": 94.17, + "totalSavings": 15.30 + } +} diff --git a/tests/fixtures/meijer_email_receipt.html b/tests/fixtures/meijer_email_receipt.html new file mode 100644 index 0000000..f61deb3 --- /dev/null +++ b/tests/fixtures/meijer_email_receipt.html @@ -0,0 +1,127 @@ + + + + + + Meijer Digital Receipt + + + +
+
+

MEIJER

+

Digital Receipt

+
+ +
+

Meijer Store #42

+

1555 Lake Drive SE, Grand Rapids, MI 49506

+
+ +
+
+ Date: March 15, 2026
+ Time: 2:34 PM +
+
+ Transaction #
+ TXN-2026-0315-0042 +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ItemQtyPrice
ORGANIC BANANAS1$0.69
WHOLE MILK 1 GAL1$4.29
MEIJER WHOLE GRAIN OAT CEREAL 18OZ1$4.99
FRESH BROCCOLI CROWN1$2.49
GROUND BEEF 85/15 1LB1$6.99
SOURDOUGH BREAD1$3.99
MEIJER BABY SPINACH 5OZ1$4.49
LARGE EGGS DOZEN1$3.29
+ +
+
+ Subtotal + $31.22 +
+
+ Tax + $2.19 +
+
+ Total Savings + -$3.40 +
+
+ Total + $33.41 +
+
+ + +
+ + diff --git a/tests/fixtures/meijer_receipt.json b/tests/fixtures/meijer_receipt.json new file mode 100644 index 0000000..a733215 --- /dev/null +++ b/tests/fixtures/meijer_receipt.json @@ -0,0 +1,85 @@ +{ + "transactionId": "TXN-2026-0310-001", + "transactionDate": "2026-03-10T14:30:00Z", + "storeNumber": "42", + "total": 87.42, + "savings": 12.50, + "detail": { + "receiptId": "TXN-2026-0310-001", + "items": [ + { + "description": "ORGANIC BANANAS", + "upc": "0000000004011", + "quantity": 1, + "price": 0.69, + "extendedPrice": 0.69, + "regularPrice": 0.79, + "salePrice": 0.69, + "couponDiscount": 0.0, + "mperksDiscount": 0.10, + "category": "PRODUCE" + }, + { + "description": "MEIJER 2% MILK GAL", + "upc": "0041250000123", + "quantity": 2, + "price": 3.49, + "extendedPrice": 6.98, + "regularPrice": 3.79, + "salePrice": 3.49, + "couponDiscount": 0.0, + "mperksDiscount": 0.0, + "category": "DAIRY" + }, + { + "description": "CHEERIOS 18OZ", + "upc": "0016000275614", + "quantity": 1, + "price": 4.99, + "extendedPrice": 4.99, + "regularPrice": 5.49, + "salePrice": null, + "couponDiscount": 0.50, + "mperksDiscount": 0.0, + "category": "CEREAL" + }, + { + "description": "WEIGHTED DELI TURKEY", + "upc": null, + "quantity": 0.75, + "price": 8.99, + "extendedPrice": 6.74, + "regularPrice": 8.99, + "salePrice": null, + "couponDiscount": null, + "mperksDiscount": null, + "category": "DELI" + }, + { + "description": "VOIDED SODA 12PK", + "upc": "0004900005678", + "quantity": 1, + "price": 5.99, + "extendedPrice": 5.99, + "voided": true, + "category": "BEVERAGES" + }, + { + "description": "MEIJER PAPER TOWELS 6PK", + "upc": "0041250099001", + "quantity": 1, + "price": 7.99, + "extendedPrice": 7.99, + "regularPrice": 9.99, + "salePrice": 7.99, + "couponDiscount": 1.00, + "mperksDiscount": 1.00, + "category": "HOUSEHOLD" + } + ], + "subtotal": 74.92, + "tax": 5.24, + "total": 87.42, + "totalSavings": 12.50 + } +} diff --git a/tests/fixtures/target_email_receipt.html 
b/tests/fixtures/target_email_receipt.html new file mode 100644 index 0000000..70f0720 --- /dev/null +++ b/tests/fixtures/target_email_receipt.html @@ -0,0 +1,44 @@ + + + + + Target Order Confirmation + + +
+ Target +

Order Confirmation

+

Thanks for shopping Target Circle!

+
+ +
+

Target Store #1247 - Riverside

+

4500 River Road
Columbus, OH 43220

+

Date: 03/18/2026

+

Order #: TGT-2026-0318-9124

+

Confirmation #: CNF-44772819

+
+ +
+

Items Purchased

+

Good & Gather Whole Milk 1 Gal $3.89

+

Arborio Rice 2lb bag $6.49

+

Parmesan Wedge 8oz $7.99

+
+ +
+

Subtotal: $18.37

+

Tax: $1.45

+

Total: $19.82

+
+ +
+

Target Circle offer saved you $0.30 on this order.

+
+ +
+

Questions? Call Target Guest Services at 1-800-591-3869.

+

Receipt valid for returns within 30 days.

+
+ + \ No newline at end of file diff --git a/tests/fixtures/target_receipt.json b/tests/fixtures/target_receipt.json new file mode 100644 index 0000000..c76bb5b --- /dev/null +++ b/tests/fixtures/target_receipt.json @@ -0,0 +1,140 @@ +{ + "orderId": "TGT-2026-0315-7890", + "purchaseDate": "2026-03-15T11:23:00Z", + "storeNumber": "2774", + "total": 83.21, + "savings": 11.45, + "detail": { + "receiptId": "TGT-2026-0315-7890", + "items": [ + { + "description": "GOOD & GATHER WHOLE MILK GAL", + "tcin": "14767459", + "upc": "0085239100123", + "quantity": 1, + "unitPrice": 3.89, + "totalPrice": 3.89, + "regularPrice": 4.19, + "circlePrice": 3.89, + "couponDiscount": 0.0, + "circleRewardsDiscount": 0.30, + "promoDescription": "Circle offer: Save 30c", + "department": "GROCERY" + }, + { + "description": "BANANAS", + "upc": "0000000004011", + "quantity": 1, + "unitPrice": 0.25, + "totalPrice": 0.25, + "regularPrice": 0.25, + "circlePrice": null, + "couponDiscount": null, + "circleRewardsDiscount": null, + "department": "PRODUCE" + }, + { + "description": "MARKET PANTRY LARGE EGGS 18CT", + "tcin": "13292174", + "upc": "0085239206753", + "quantity": 2, + "unitPrice": 4.99, + "totalPrice": 9.98, + "regularPrice": 5.49, + "circlePrice": 4.99, + "couponDiscount": 0.0, + "circleRewardsDiscount": 1.00, + "promoDescription": "Circle offer: 2 for $10", + "department": "GROCERY" + }, + { + "description": "DELI SLICED TURKEY BREAST", + "upc": null, + "quantity": 0.72, + "unitPrice": 10.99, + "totalPrice": 7.91, + "regularPrice": 10.99, + "weight": 0.72, + "weightUom": "LB", + "department": "DELI" + }, + { + "description": "TIDE PODS 42CT", + "tcin": "76150253", + "upc": "0003700096223", + "quantity": 1, + "unitPrice": 13.49, + "totalPrice": 13.49, + "regularPrice": 15.99, + "circlePrice": 13.49, + "couponDiscount": 2.50, + "circleRewardsDiscount": 0.0, + "promoDescription": "Circle offer + mfr coupon", + "department": "HOUSEHOLD" + }, + { + "description": "UP&UP PAPER TOWELS 6PK", + 
"tcin": "52493117", + "upc": "0085239401567", + "quantity": 1, + "unitPrice": 8.99, + "totalPrice": 8.99, + "regularPrice": 8.99, + "circlePrice": null, + "couponDiscount": null, + "circleRewardsDiscount": null, + "department": "HOUSEHOLD" + }, + { + "description": "VOIDED COCA-COLA 12PK", + "upc": "0004900002521", + "quantity": 1, + "unitPrice": 7.49, + "totalPrice": 7.49, + "voided": true, + "department": "BEVERAGES" + }, + { + "description": "RETURNED OLAY MOISTURIZER", + "upc": "0007560402118", + "quantity": 1, + "unitPrice": 12.99, + "totalPrice": 12.99, + "status": "RETURNED", + "department": "BEAUTY" + }, + { + "description": "FAVOURITE DAY TRAIL MIX", + "tcin": "83921045", + "dpci": "271-09-0142", + "upc": "0085239700891", + "quantity": 1, + "unitPrice": 5.49, + "totalPrice": 5.49, + "regularPrice": 5.49, + "circlePrice": null, + "couponDiscount": null, + "circleRewardsDiscount": null, + "department": "SNACKS" + }, + { + "description": "BOGO GOOD & GATHER PASTA", + "tcin": "78114326", + "upc": "0085239300456", + "quantity": 2, + "unitPrice": 1.79, + "totalPrice": 1.79, + "regularPrice": 1.79, + "circlePrice": 0.895, + "couponDiscount": 0.0, + "circleRewardsDiscount": 1.79, + "promoDescription": "Buy 1 get 1 free", + "department": "GROCERY" + } + ], + "subtotal": 78.32, + "tax": 4.89, + "total": 83.21, + "totalSavings": 11.45 + } +} diff --git a/tests/test_api/__init__.py b/tests/test_api/__init__.py new file mode 100644 index 0000000..598c2e0 --- /dev/null +++ b/tests/test_api/__init__.py @@ -0,0 +1 @@ +"""Tests for the ReceiptWitness API routes.""" diff --git a/tests/test_api/test_webhook.py b/tests/test_api/test_webhook.py new file mode 100644 index 0000000..164144a --- /dev/null +++ b/tests/test_api/test_webhook.py @@ -0,0 +1,125 @@ +"""Tests for the /inbound/email webhook endpoint.""" + +import hashlib +import hmac +import time +from unittest.mock import AsyncMock, patch + +import pytest +from fastapi.testclient import TestClient + +from 
receiptwitness.main import app + + +@pytest.fixture +def client(): + return TestClient(app) + + +@pytest.fixture +def mock_redis(): + redis_mock = AsyncMock() + with patch("receiptwitness.api.routes.get_redis", return_value=redis_mock): + enqueue_patcher = patch("receiptwitness.api.routes.enqueue_email", new_callable=AsyncMock) + with enqueue_patcher as mock_enqueue: + yield {"redis": redis_mock, "enqueue": mock_enqueue} + + +def make_signature(signing_key: str, token: str, timestamp: str) -> str: + return hmac.new( + signing_key.encode(), + f"{timestamp}{token}".encode(), + hashlib.sha256, + ).hexdigest() + + +def valid_form(signing_key: str = "test-secret"): + ts = str(int(time.time())) + token = "test-token" + sig = make_signature(signing_key, token, ts) + return { + "token": token, + "timestamp": ts, + "signature": sig, + "sender": "sender@example.com", + "recipient": "receipts+user123@example.com", + "subject": "Your Meijer Receipt", + "body-html": "

Thank you for shopping at Meijer

", + "body-plain": "Thank you for shopping at Meijer", + "Message-Id": "", + } + + +def test_valid_webhook(client, mock_redis): + with patch("receiptwitness.api.routes.settings") as mock_settings: + mock_settings.mailgun_webhook_signing_key = "test-secret" + response = client.post("/inbound/email", data=valid_form()) + assert response.status_code == 200 + assert response.json() == {"status": "queued"} + mock_redis["enqueue"].assert_awaited_once() + + +def test_invalid_signature(client, mock_redis): + with patch("receiptwitness.api.routes.settings") as mock_settings: + mock_settings.mailgun_webhook_signing_key = "test-secret" + form = valid_form() + form["signature"] = "wrong-signature" + response = client.post("/inbound/email", data=form) + assert response.status_code == 406 + assert response.json()["detail"] == "Invalid signature" + mock_redis["enqueue"].assert_not_awaited() + + +def test_invalid_recipient_no_plus(client, mock_redis): + with patch("receiptwitness.api.routes.settings") as mock_settings: + mock_settings.mailgun_webhook_signing_key = "test-secret" + form = valid_form() + form["recipient"] = "receipts@example.com" # no plus-address + response = client.post("/inbound/email", data=form) + assert response.status_code == 406 + assert response.json()["detail"] == "Invalid recipient" + mock_redis["enqueue"].assert_not_awaited() + + +def test_stale_timestamp(client, mock_redis): + with patch("receiptwitness.api.routes.settings") as mock_settings: + mock_settings.mailgun_webhook_signing_key = "test-secret" + ts = str(int(time.time()) - 600) # 10 min old + token = "test-token" + sig = make_signature("test-secret", token, ts) + form = { + "token": token, + "timestamp": ts, + "signature": sig, + "sender": "sender@example.com", + "recipient": "receipts+user123@example.com", + "subject": "Receipt", + } + response = client.post("/inbound/email", data=form) + assert response.status_code == 406 + assert response.json()["detail"] == "Invalid signature" + 
mock_redis["enqueue"].assert_not_awaited() + + +def test_invalid_timestamp_returns_406(client, mock_redis): + """Empty timestamp should return 406, not 500.""" + with patch("receiptwitness.api.routes.settings") as mock_settings: + mock_settings.mailgun_webhook_signing_key = "test-secret" + form = { + "token": "test-token", + "timestamp": "", + "signature": "any-sig", + "sender": "sender@example.com", + "recipient": "receipts+user123@example.com", + "subject": "Receipt", + } + response = client.post("/inbound/email", data=form) + assert response.status_code == 406 + assert response.json()["detail"] == "Invalid signature" + mock_redis["enqueue"].assert_not_awaited() + + +def test_get_inbound_email_returns_405(client): + """GET /inbound/email is not allowed.""" + response = client.get("/inbound/email") + assert response.status_code == 405 diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..059573b --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,46 @@ +import pytest +from receiptwitness.config import ReceiptWitnessSettings + + +def test_valid_config(): + s = ReceiptWitnessSettings( + session_encryption_key="7reF42nmTwbdN21PBoubGp7h_FU8qSimstmlaMLoRK8=" + ) + assert s.session_encryption_key + + +def test_missing_session_encryption_key_raises(): + with pytest.raises(ValueError, match="RW_SESSION_ENCRYPTION_KEY"): + ReceiptWitnessSettings(session_encryption_key="") + + +def test_placeholder_session_encryption_key_raises(): + with pytest.raises(ValueError, match="RW_SESSION_ENCRYPTION_KEY"): + ReceiptWitnessSettings(session_encryption_key="change-me-in-production") + + +def test_notifications_enabled_without_resend_key_raises(): + with pytest.raises(ValueError, match="RW_RESEND_API_KEY"): + ReceiptWitnessSettings( + session_encryption_key="7reF42nmTwbdN21PBoubGp7h_FU8qSimstmlaMLoRK8=", + notifications_enabled=True, + resend_api_key="", + ) + + +def test_notifications_disabled_without_resend_key_ok(): + s = 
ReceiptWitnessSettings( + session_encryption_key="7reF42nmTwbdN21PBoubGp7h_FU8qSimstmlaMLoRK8=", + notifications_enabled=False, + resend_api_key="", + ) + assert s.notifications_enabled is False + + +def test_notifications_enabled_with_resend_key_ok(): + s = ReceiptWitnessSettings( + session_encryption_key="7reF42nmTwbdN21PBoubGp7h_FU8qSimstmlaMLoRK8=", + notifications_enabled=True, + resend_api_key="re_test_1234567890", + ) + assert s.resend_api_key == "re_test_1234567890" diff --git a/tests/test_notifications/__init__.py b/tests/test_notifications/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_notifications/test_email.py b/tests/test_notifications/test_email.py new file mode 100644 index 0000000..e8970e9 --- /dev/null +++ b/tests/test_notifications/test_email.py @@ -0,0 +1,84 @@ +"""Tests for email notifications.""" + +from unittest.mock import patch + +import pytest + + +class TestSendReceiptNotification: + @pytest.fixture + def mock_resend(self): + with patch("receiptwitness.notifications.email.resend") as mock: + yield mock + + @pytest.mark.asyncio + async def test_sends_email_with_correct_params(self, mock_resend): + from receiptwitness.notifications.email import send_receipt_notification + + with ( + patch("receiptwitness.notifications.email.settings") as mock_settings, + patch( + "receiptwitness.notifications.email.asyncio.to_thread", + new=lambda fn, *args, **kwargs: fn(*args, **kwargs), + ), + ): + mock_settings.notifications_enabled = True + mock_settings.resend_api_key = "re_testkey_123" + mock_settings.notification_email_from = "noreply@test.com" + + await send_receipt_notification( + user_email="user@example.com", + store_name="Meijer", + item_count=5, + total=42.99, + purchase_date="2026-03-28", + ) + + mock_resend.Emails.send.assert_called_once_with( + { + "from": "noreply@test.com", + "to": ["user@example.com"], + "subject": "Receipt processed: Meijer - $42.99", + "html": ( + "

Your receipt from Meijer on " + "2026-03-28 has been processed.

" + "

5 items, total: $42.99

" + ), + } + ) + + @pytest.mark.asyncio + async def test_skips_when_disabled(self, mock_resend): + from receiptwitness.notifications.email import send_receipt_notification + + with patch("receiptwitness.notifications.email.settings") as mock_settings: + mock_settings.notifications_enabled = False + mock_settings.resend_api_key = "re_testkey_123" + + await send_receipt_notification( + user_email="user@example.com", + store_name="Meijer", + item_count=5, + total=42.99, + purchase_date="2026-03-28", + ) + + mock_resend.Emails.send.assert_not_called() + + @pytest.mark.asyncio + async def test_skips_when_api_key_empty(self, mock_resend): + from receiptwitness.notifications.email import send_receipt_notification + + with patch("receiptwitness.notifications.email.settings") as mock_settings: + mock_settings.notifications_enabled = True + mock_settings.resend_api_key = "" + + await send_receipt_notification( + user_email="user@example.com", + store_name="Meijer", + item_count=5, + total=42.99, + purchase_date="2026-03-28", + ) + + mock_resend.Emails.send.assert_not_called() diff --git a/tests/test_parsers/__init__.py b/tests/test_parsers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_parsers/test_email/__init__.py b/tests/test_parsers/test_email/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_parsers/test_email/test_detector.py b/tests/test_parsers/test_email/test_detector.py new file mode 100644 index 0000000..87a5aac --- /dev/null +++ b/tests/test_parsers/test_email/test_detector.py @@ -0,0 +1,49 @@ +"""Tests for retailer detector.""" + +from receiptwitness.parsers.email.base import EmailReceipt +from receiptwitness.parsers.email.detector import detect_retailer + + +def test_detect_meijer(): + email = EmailReceipt( + sender="receipts@meijer.com", + recipient="user@example.com", + subject="Your Receipt", + ) + assert detect_retailer(email) == "meijer" + + +def test_detect_kroger(): + email = EmailReceipt( + 
sender="noreply@email.kroger.com", + recipient="user@example.com", + subject="Your Receipt", + ) + assert detect_retailer(email) == "kroger" + + +def test_detect_target(): + email = EmailReceipt( + sender="Target ", + recipient="user@example.com", + subject="Your Receipt", + ) + assert detect_retailer(email) == "target" + + +def test_detect_unknown(): + email = EmailReceipt( + sender="noreply@walmart.com", + recipient="user@example.com", + subject="Your Receipt", + ) + assert detect_retailer(email) is None + + +def test_detect_case_insensitive(): + email = EmailReceipt( + sender="Receipts@MEIJER.COM", + recipient="user@example.com", + subject="Your Receipt", + ) + assert detect_retailer(email) == "meijer" diff --git a/tests/test_parsers/test_email/test_kroger_email_parser.py b/tests/test_parsers/test_email/test_kroger_email_parser.py new file mode 100644 index 0000000..ab30257 --- /dev/null +++ b/tests/test_parsers/test_email/test_kroger_email_parser.py @@ -0,0 +1,93 @@ +"""Tests for KrogerEmailParser.""" + +from pathlib import Path + +from receiptwitness.parsers.email.base import EmailReceipt +from receiptwitness.parsers.email.kroger import KrogerEmailParser + +FIXTURE_PATH = Path(__file__).parent.parent.parent / "fixtures" / "kroger_email_receipt.html" + + +class TestKrogerEmailParser: + """Tests for KrogerEmailParser.""" + + def setup_method(self) -> None: + self.parser = KrogerEmailParser() + self.fixture_html = FIXTURE_PATH.read_text() + + def test_can_parse_kroger_sender(self) -> None: + email = EmailReceipt( + sender="noreply@email.kroger.com", + recipient="user@example.com", + subject="Your Kroger Receipt", + body_html=self.fixture_html, + ) + assert self.parser.can_parse(email) is True + + def test_can_parse_kroger_in_body(self) -> None: + email = EmailReceipt( + sender="someone@unknown.com", + recipient="user@example.com", + subject="Your Receipt", + body_html="Kroger digital receipt", + ) + assert self.parser.can_parse(email) is True + + def 
test_cannot_parse_unrelated(self) -> None: + email = EmailReceipt( + sender="noreply@walmart.com", + recipient="user@example.com", + subject="Your Receipt", + body_html="Walmart receipt", + ) + assert self.parser.can_parse(email) is False + + def test_parse_items(self) -> None: + email = EmailReceipt( + sender="noreply@kroger.com", + recipient="user@example.com", + subject="Your Kroger Receipt", + body_html=self.fixture_html, + ) + result = self.parser.parse(email) + items = result.get("items", []) + assert len(items) >= 3 + product_names = [item["product_name_raw"] for item in items] + assert any("Whole Milk" in name for name in product_names) + assert any("Sourdough" in name for name in product_names) + for item in items: + assert "unit_price" in item + assert "extended_price" in item + + def test_parse_totals(self) -> None: + email = EmailReceipt( + sender="noreply@kroger.com", + recipient="user@example.com", + subject="Your Kroger Receipt", + body_html=self.fixture_html, + ) + result = self.parser.parse(email) + total = result.get("total", 0) + assert total > 0 + + def test_parse_receipt_id(self) -> None: + email = EmailReceipt( + sender="noreply@kroger.com", + recipient="user@example.com", + subject="Your Kroger Receipt", + body_html=self.fixture_html, + ) + result = self.parser.parse(email) + receipt_id = result.get("receipt_id", "") + assert "KR-2026" in receipt_id or "TXN" in receipt_id + + def test_parse_date(self) -> None: + email = EmailReceipt( + sender="noreply@kroger.com", + recipient="user@example.com", + subject="Your Kroger Receipt", + body_html=self.fixture_html, + ) + result = self.parser.parse(email) + purchase_date = result.get("purchase_date", "") + assert purchase_date == "2026-03-15" diff --git a/tests/test_parsers/test_email/test_meijer_parser.py b/tests/test_parsers/test_email/test_meijer_parser.py new file mode 100644 index 0000000..3c33976 --- /dev/null +++ b/tests/test_parsers/test_email/test_meijer_parser.py @@ -0,0 +1,182 @@ +"""Tests 
for the Meijer email receipt parser.""" + +import os +from decimal import Decimal + +import pytest + +from receiptwitness.parsers.email.base import EmailReceipt +from receiptwitness.parsers.email.meijer import MeijerEmailParser + +FIXTURE_PATH = os.path.join( + os.path.dirname(__file__), "..", "..", "fixtures", "meijer_email_receipt.html" +) + + +def load_fixture() -> str: + with open(FIXTURE_PATH) as f: + return f.read() + + +@pytest.fixture +def meijer_email() -> EmailReceipt: + html = load_fixture() + return EmailReceipt( + sender="Meijer Receipts ", + recipient="shopper@example.com", + subject="Your Meijer Receipt — Transaction #TXN-2026-0315-0042", + body_html=html, + body_plain=None, + received_at="2026-03-15T14:34:00Z", + ) + + +@pytest.fixture +def kroger_email() -> EmailReceipt: + return EmailReceipt( + sender="Kroger ", + recipient="shopper@example.com", + subject="Your Kroger Receipt", + body_html="Kroger receipt", + ) + + +class TestCanParse: + def test_can_parse_meijer(self, meijer_email: EmailReceipt): + parser = MeijerEmailParser() + assert parser.can_parse(meijer_email) is True + + def test_cannot_parse_kroger(self, kroger_email: EmailReceipt): + parser = MeijerEmailParser() + assert parser.can_parse(kroger_email) is False + + def test_can_parse_meijer_plain_sender(self): + email = EmailReceipt( + sender="receipts@meijer.com", + recipient="shopper@example.com", + subject="Receipt", + body_html="", + ) + parser = MeijerEmailParser() + assert parser.can_parse(email) is True + + def test_cannot_parse_non_meijer(self): + email = EmailReceipt( + sender=" Target ", + recipient="shopper@example.com", + subject="Target Receipt", + body_html="", + ) + parser = MeijerEmailParser() + assert parser.can_parse(email) is False + + +class TestParseMeijerReceipt: + def test_receipt_id_extracted(self, meijer_email: EmailReceipt): + parser = MeijerEmailParser() + result = parser.parse(meijer_email) + assert result["receipt_id"] == "TXN-2026-0315-0042" + + def 
test_purchase_date_extracted(self, meijer_email: EmailReceipt): + parser = MeijerEmailParser() + result = parser.parse(meijer_email) + assert result["purchase_date"] == "2026-03-15" + + def test_items_extracted(self, meijer_email: EmailReceipt): + parser = MeijerEmailParser() + result = parser.parse(meijer_email) + items = result["items"] + assert len(items) == 8 + + names = [item["product_name_raw"] for item in items] + assert "ORGANIC BANANAS" in names + assert "WHOLE MILK 1 GAL" in names + assert "GROUND BEEF 85/15 1LB" in names + + def test_item_quantities(self, meijer_email: EmailReceipt): + parser = MeijerEmailParser() + result = parser.parse(meijer_email) + # Find ORGANIC BANANAS + bananas = next(i for i in result["items"] if "BANANAS" in i["product_name_raw"]) + assert bananas["quantity"] == Decimal("1") + + def test_item_prices(self, meijer_email: EmailReceipt): + parser = MeijerEmailParser() + result = parser.parse(meijer_email) + # Find ORGANIC BANANAS + bananas = next(i for i in result["items"] if "BANANAS" in i["product_name_raw"]) + assert bananas["unit_price"] == Decimal("0.69") + assert bananas["extended_price"] == Decimal("0.69") + + def test_totals(self, meijer_email: EmailReceipt): + parser = MeijerEmailParser() + result = parser.parse(meijer_email) + assert result["total"] == Decimal("33.41") + assert result["subtotal"] == Decimal("31.22") + assert result["tax"] == Decimal("2.19") + assert result["savings_total"] == Decimal("3.40") + + +class TestParseHandlesMissingFields: + def test_missing_body_html_falls_back_to_plain(self): + email = EmailReceipt( + sender="receipts@email.meijer.com", + recipient="shopper@example.com", + subject="Your Meijer Receipt", + body_html=None, + body_plain="TXN-1234 | March 15, 2026 | Total: $10.00", + ) + parser = MeijerEmailParser() + result = parser.parse(email) + # Should not raise, returns minimal result + assert result["receipt_id"] == "" + assert result["purchase_date"] == "2026-03-15" + assert 
result["total"] == Decimal("10.00") + + def test_empty_email(self): + email = EmailReceipt( + sender="receipts@email.meijer.com", + recipient="shopper@example.com", + subject="Receipt", + body_html="", + body_plain="", + ) + parser = MeijerEmailParser() + result = parser.parse(email) + assert result["receipt_id"] == "" + assert result["purchase_date"] == "" + assert result["total"] == Decimal("0") + assert result["items"] == [] + + def test_missing_subject_date_from_body(self): + html = """ + + +

Thank you for shopping on April 1, 2026

+

Total: $15.00

+ + + """ + email = EmailReceipt( + sender="receipts@email.meijer.com", + recipient="shopper@example.com", + subject=None, + body_html=html, + ) + parser = MeijerEmailParser() + result = parser.parse(email) + assert result["purchase_date"] == "2026-04-01" + + def test_missing_totals_defaults_to_zero(self): + html = "

Just an email with no totals

" + email = EmailReceipt( + sender="receipts@email.meijer.com", + recipient="shopper@example.com", + subject="Receipt", + body_html=html, + ) + parser = MeijerEmailParser() + result = parser.parse(email) + assert result["total"] == Decimal("0") + assert result["subtotal"] is None + assert result["tax"] is None diff --git a/tests/test_parsers/test_email/test_target_email_parser.py b/tests/test_parsers/test_email/test_target_email_parser.py new file mode 100644 index 0000000..ffa33db --- /dev/null +++ b/tests/test_parsers/test_email/test_target_email_parser.py @@ -0,0 +1,93 @@ +"""Tests for TargetEmailParser.""" + +from pathlib import Path + +from receiptwitness.parsers.email.base import EmailReceipt +from receiptwitness.parsers.email.target import TargetEmailParser + +FIXTURE_PATH = Path(__file__).parent.parent.parent / "fixtures" / "target_email_receipt.html" + + +class TestTargetEmailParser: + """Tests for TargetEmailParser.""" + + def setup_method(self) -> None: + self.parser = TargetEmailParser() + self.fixture_html = FIXTURE_PATH.read_text() + + def test_can_parse_target_sender(self) -> None: + email = EmailReceipt( + sender="receipts@target.com", + recipient="user@example.com", + subject="Your Target Order Confirmation", + body_html=self.fixture_html, + ) + assert self.parser.can_parse(email) is True + + def test_can_parse_circle_in_body(self) -> None: + email = EmailReceipt( + sender="someone@unknown.com", + recipient="user@example.com", + subject="Your Receipt", + body_html="Target Circle savings offer", + ) + assert self.parser.can_parse(email) is True + + def test_cannot_parse_unrelated(self) -> None: + email = EmailReceipt( + sender="noreply@walmart.com", + recipient="user@example.com", + subject="Your Receipt", + body_html="Walmart receipt", + ) + assert self.parser.can_parse(email) is False + + def test_parse_items(self) -> None: + email = EmailReceipt( + sender="orders@target.com", + recipient="user@example.com", + subject="Your Target Order", + 
body_html=self.fixture_html, + ) + result = self.parser.parse(email) + items = result.get("items", []) + assert len(items) >= 3 + product_names = [item["product_name_raw"] for item in items] + assert any("Whole Milk" in name for name in product_names) + assert any("Arborio" in name for name in product_names) + for item in items: + assert "unit_price" in item + assert "extended_price" in item + + def test_parse_totals(self) -> None: + email = EmailReceipt( + sender="orders@target.com", + recipient="user@example.com", + subject="Your Target Order", + body_html=self.fixture_html, + ) + result = self.parser.parse(email) + total = result.get("total", 0) + assert total > 0 + + def test_parse_receipt_id(self) -> None: + email = EmailReceipt( + sender="orders@target.com", + recipient="user@example.com", + subject="Your Target Order", + body_html=self.fixture_html, + ) + result = self.parser.parse(email) + receipt_id = result.get("receipt_id", "") + assert "TGT-2026" in receipt_id or "CNF" in receipt_id + + def test_parse_date(self) -> None: + email = EmailReceipt( + sender="orders@target.com", + recipient="user@example.com", + subject="Your Target Order", + body_html=self.fixture_html, + ) + result = self.parser.parse(email) + purchase_date = result.get("purchase_date", "") + assert purchase_date == "2026-03-18" diff --git a/tests/test_parsers/test_kroger_parser.py b/tests/test_parsers/test_kroger_parser.py new file mode 100644 index 0000000..001d205 --- /dev/null +++ b/tests/test_parsers/test_kroger_parser.py @@ -0,0 +1,399 @@ +"""Tests for the Kroger receipt parser.""" + +from decimal import Decimal + +from receiptwitness.parsers.kroger import _parse_item, _to_decimal, parse_kroger_receipt +from receiptwitness.scrapers.base import RawReceipt + + +class TestToDecimal: + def test_from_int(self): + assert _to_decimal(42) == Decimal("42") + + def test_from_float(self): + assert _to_decimal(3.99) == Decimal("3.99") + + def test_from_string(self): + assert _to_decimal("7.49") 
== Decimal("7.49") + + def test_none_returns_default(self): + assert _to_decimal(None) == Decimal("0") + + def test_none_custom_default(self): + assert _to_decimal(None, "1") == Decimal("1") + + def test_invalid_string_returns_default(self): + assert _to_decimal("not-a-number") == Decimal("0") + + def test_empty_string_returns_default(self): + assert _to_decimal("") == Decimal("0") + + +class TestParseItem: + def test_standard_item(self): + raw = { + "description": "KROGER WHOLE MILK GAL", + "upc": "0001111041700", + "quantity": 1, + "basePrice": 3.99, + "totalPrice": 3.99, + "regularPrice": 4.29, + "salePrice": 3.99, + "couponAmount": 0.0, + "plusCardSavings": 0.30, + "department": "DAIRY", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "KROGER WHOLE MILK GAL" + assert result["upc"] == "1111041700" + assert result["quantity"] == Decimal("1") + assert result["unit_price"] == Decimal("3.99") + assert result["extended_price"] == Decimal("3.99") + assert result["regular_price"] == Decimal("4.29") + assert result["sale_price"] == Decimal("3.99") + assert result["loyalty_discount"] == Decimal("0.30") + assert result["category_raw"] == "DAIRY" + + def test_weighted_item(self): + raw = { + "description": "KROGER DELI TURKEY BREAST", + "quantity": 0.68, + "basePrice": 9.99, + "totalPrice": 6.79, + "weight": 0.68, + "weightUom": "LB", + "department": "DELI", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "KROGER DELI TURKEY BREAST" + assert result["upc"] is None + assert result["quantity"] == Decimal("0.68") + assert result["unit_price"] == Decimal("9.99") + assert result["extended_price"] == Decimal("6.79") + + def test_missing_extended_price_computed(self): + raw = { + "description": "TEST ITEM", + "quantity": 3, + "basePrice": 2.49, + } + result = _parse_item(raw) + assert result["extended_price"] == Decimal("2.49") * Decimal("3") + + def test_item_with_coupon(self): + raw = { + "description": "TIDE PODS 42CT", + "upc": 
"0003700096223", + "quantity": 1, + "basePrice": 13.99, + "totalPrice": 13.99, + "couponAmount": 2.00, + } + result = _parse_item(raw) + assert result["coupon_discount"] == Decimal("2.00") + + def test_missing_description_fallback(self): + raw = {"basePrice": 1.00, "totalPrice": 1.00} + result = _parse_item(raw) + assert result["product_name_raw"] == "UNKNOWN ITEM" + + def test_alternative_field_names_product_name(self): + raw = { + "productName": "ALT NAME ITEM", + "unitPrice": 5.00, + "extendedAmount": 5.00, + "qty": 1, + "krogerProductId": "123456789", + "category": "GROCERY", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "ALT NAME ITEM" + assert result["unit_price"] == Decimal("5.00") + assert result["extended_price"] == Decimal("5.00") + assert result["upc"] == "123456789" + assert result["category_raw"] == "GROCERY" + + def test_item_description_field_name(self): + raw = { + "itemDescription": "ITEM DESC FIELD", + "price": 3.00, + "lineTotal": 3.00, + } + result = _parse_item(raw) + assert result["product_name_raw"] == "ITEM DESC FIELD" + assert result["unit_price"] == Decimal("3.00") + assert result["extended_price"] == Decimal("3.00") + + def test_null_optional_fields(self): + raw = { + "description": "BANANAS", + "upc": "0000000004011", + "quantity": 1, + "basePrice": 0.59, + "totalPrice": 0.59, + "salePrice": None, + "couponAmount": None, + "plusCardSavings": None, + } + result = _parse_item(raw) + assert result["sale_price"] is None + assert result["coupon_discount"] is None + assert result["loyalty_discount"] is None + + def test_upc_leading_zeros_stripped(self): + raw = { + "description": "TEST", + "upc": "0000000004011", + "basePrice": 1.00, + "totalPrice": 1.00, + } + result = _parse_item(raw) + assert result["upc"] == "4011" + + def test_upc_from_kroger_product_id(self): + raw = { + "description": "TEST", + "krogerProductId": "987654321", + "basePrice": 1.00, + "totalPrice": 1.00, + } + result = _parse_item(raw) + assert 
result["upc"] == "987654321" + + def test_description_whitespace_stripped(self): + raw = { + "description": " EXTRA SPACES ", + "basePrice": 1.00, + "totalPrice": 1.00, + } + result = _parse_item(raw) + assert result["product_name_raw"] == "EXTRA SPACES" + + def test_promo_price_field(self): + raw = { + "description": "PROMO ITEM", + "promoPrice": 2.99, + "originalPrice": 4.99, + "basePrice": 2.99, + "totalPrice": 2.99, + } + result = _parse_item(raw) + assert result["sale_price"] == Decimal("2.99") + assert result["regular_price"] == Decimal("4.99") + + def test_loyalty_discount_from_fuel_points(self): + raw = { + "description": "FUEL DISC ITEM", + "fuelPointsDiscount": 0.50, + "basePrice": 3.00, + "totalPrice": 3.00, + } + result = _parse_item(raw) + assert result["loyalty_discount"] == Decimal("0.50") + + def test_multi_quantity_item(self): + raw = { + "description": "PRIVATE SELECTION PASTA", + "quantity": 3, + "basePrice": 2.49, + "totalPrice": 7.47, + "department": "GROCERY", + } + result = _parse_item(raw) + assert result["quantity"] == Decimal("3") + assert result["unit_price"] == Decimal("2.49") + assert result["extended_price"] == Decimal("7.47") + + def test_aisle_as_category(self): + raw = { + "description": "AISLE ITEM", + "aisle": "FROZEN FOODS", + "basePrice": 4.00, + "totalPrice": 4.00, + } + result = _parse_item(raw) + assert result["category_raw"] == "FROZEN FOODS" + + +class TestParseKrogerReceipt: + def test_full_receipt(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12T16:45:00Z", + store_number="00357", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + + assert result["receipt_id"] == "KR-2026-0312-4471" + assert result["purchase_date"] == "2026-03-12T16:45:00Z" + assert result["total"] == Decimal("94.17") + assert result["subtotal"] == Decimal("78.47") + assert result["tax"] == Decimal("5.50") + assert result["savings_total"] == Decimal("15.30") + + # Should 
have 8 items (voided + returned items excluded) + assert len(result["items"]) == 8 + + # Verify first item + milk = result["items"][0] + assert milk["product_name_raw"] == "KROGER WHOLE MILK GAL" + assert milk["upc"] == "1111041700" + + def test_voided_items_excluded(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + + item_names = [i["product_name_raw"] for i in result["items"]] + assert "VOIDED DORITOS NACHO" not in item_names + + def test_returned_items_excluded(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + + item_names = [i["product_name_raw"] for i in result["items"]] + assert "RETURNED GATORADE 8PK" not in item_names + + def test_return_flag_items_excluded(self): + data = { + "detail": { + "items": [ + { + "description": "NORMAL ITEM", + "basePrice": 5.00, + "totalPrice": 5.00, + }, + { + "description": "RETURNED VIA FLAG", + "basePrice": 3.00, + "totalPrice": 3.00, + "returnFlag": True, + }, + { + "description": "IS RETURN ITEM", + "basePrice": 2.00, + "totalPrice": 2.00, + "isReturn": True, + }, + ], + "total": 5.00, + } + } + raw = RawReceipt( + receipt_id="RET-001", + purchase_date="2026-03-12", + raw_data=data, + ) + result = parse_kroger_receipt(raw) + assert len(result["items"]) == 1 + assert result["items"][0]["product_name_raw"] == "NORMAL ITEM" + + def test_empty_receipt(self): + raw = RawReceipt( + receipt_id="EMPTY-001", + purchase_date="2026-03-12", + raw_data={"detail": {"items": [], "total": 0}}, + ) + result = parse_kroger_receipt(raw) + assert result["items"] == [] + assert result["total"] == Decimal("0") + + def test_receipt_with_no_detail(self): + raw = RawReceipt( + receipt_id="NO-DETAIL-001", + purchase_date="2026-03-12", + raw_data={"total": 50.00}, + ) + result = 
parse_kroger_receipt(raw) + assert result["items"] == [] + assert result["total"] == Decimal("50.00") + + def test_raw_data_preserved(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + assert result["raw_data"] is kroger_receipt_data + + def test_alternative_total_field_names(self): + raw = RawReceipt( + receipt_id="ALT-001", + purchase_date="2026-03-12", + raw_data={ + "orderTotal": 42.00, + "subTotal": 35.00, + "salesTax": 3.50, + "youSaved": 5.00, + "detail": {"items": []}, + }, + ) + result = parse_kroger_receipt(raw) + assert result["total"] == Decimal("42.00") + assert result["subtotal"] == Decimal("35.00") + assert result["tax"] == Decimal("3.50") + assert result["savings_total"] == Decimal("5.00") + + def test_receipt_items_alternative_key(self): + data = { + "detail": { + "receiptItems": [ + { + "description": "ALT KEY ITEM", + "basePrice": 3.00, + "totalPrice": 3.00, + } + ], + "total": 3.00, + } + } + raw = RawReceipt( + receipt_id="ALT-KEY-001", + purchase_date="2026-03-12", + raw_data=data, + ) + result = parse_kroger_receipt(raw) + assert len(result["items"]) == 1 + assert result["items"][0]["product_name_raw"] == "ALT KEY ITEM" + + def test_source_url_preserved(self): + raw = RawReceipt( + receipt_id="URL-001", + purchase_date="2026-03-12", + raw_data={"detail": {"items": [], "total": 0}}, + source_url="https://www.kroger.com/atlas/v1/receipt/api?orderId=URL-001", + ) + result = parse_kroger_receipt(raw) + assert result["source_url"] == "https://www.kroger.com/atlas/v1/receipt/api?orderId=URL-001" + + def test_weighted_items_in_full_receipt(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + + # Find the weighted turkey item + turkey = next(i for i in result["items"] if "TURKEY" in 
i["product_name_raw"]) + assert turkey["quantity"] == Decimal("0.68") + assert turkey["unit_price"] == Decimal("9.99") + assert turkey["extended_price"] == Decimal("6.79") + + def test_grand_total_field(self): + raw = RawReceipt( + receipt_id="GT-001", + purchase_date="2026-03-12", + raw_data={"grandTotal": 99.99, "detail": {"items": []}}, + ) + result = parse_kroger_receipt(raw) + assert result["total"] == Decimal("99.99") diff --git a/tests/test_parsers/test_meijer_parser.py b/tests/test_parsers/test_meijer_parser.py new file mode 100644 index 0000000..47a5fa9 --- /dev/null +++ b/tests/test_parsers/test_meijer_parser.py @@ -0,0 +1,174 @@ +"""Tests for the Meijer receipt parser.""" + +from decimal import Decimal + +from receiptwitness.parsers.meijer import _parse_item, _to_decimal, parse_meijer_receipt +from receiptwitness.scrapers.base import RawReceipt + + +class TestToDecimal: + def test_from_int(self): + assert _to_decimal(42) == Decimal("42") + + def test_from_float(self): + assert _to_decimal(3.49) == Decimal("3.49") + + def test_from_string(self): + assert _to_decimal("7.99") == Decimal("7.99") + + def test_none_returns_default(self): + assert _to_decimal(None) == Decimal("0") + + def test_none_custom_default(self): + assert _to_decimal(None, "1") == Decimal("1") + + def test_invalid_string_returns_default(self): + assert _to_decimal("not-a-number") == Decimal("0") + + +class TestParseItem: + def test_standard_item(self): + raw = { + "description": "ORGANIC BANANAS", + "upc": "0000000004011", + "quantity": 1, + "price": 0.69, + "extendedPrice": 0.69, + "regularPrice": 0.79, + "salePrice": 0.69, + "couponDiscount": 0.0, + "mperksDiscount": 0.10, + "category": "PRODUCE", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "ORGANIC BANANAS" + assert result["upc"] == "4011" + assert result["quantity"] == Decimal("1") + assert result["unit_price"] == Decimal("0.69") + assert result["extended_price"] == Decimal("0.69") + assert 
result["regular_price"] == Decimal("0.79") + assert result["sale_price"] == Decimal("0.69") + assert result["loyalty_discount"] == Decimal("0.10") + assert result["category_raw"] == "PRODUCE" + + def test_weighted_item(self): + raw = { + "description": "WEIGHTED DELI TURKEY", + "quantity": 0.75, + "price": 8.99, + "extendedPrice": 6.74, + "category": "DELI", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "WEIGHTED DELI TURKEY" + assert result["upc"] is None + assert result["quantity"] == Decimal("0.75") + assert result["unit_price"] == Decimal("8.99") + assert result["extended_price"] == Decimal("6.74") + + def test_missing_extended_price_computed(self): + raw = { + "description": "TEST ITEM", + "quantity": 3, + "price": 2.50, + } + result = _parse_item(raw) + assert result["extended_price"] == Decimal("2.50") * Decimal("3") + + def test_item_with_coupon_discount(self): + raw = { + "description": "CHEERIOS 18OZ", + "upc": "0016000275614", + "quantity": 1, + "price": 4.99, + "extendedPrice": 4.99, + "couponDiscount": 0.50, + } + result = _parse_item(raw) + assert result["coupon_discount"] == Decimal("0.50") + + def test_missing_description_fallback(self): + raw = {"price": 1.00, "extendedPrice": 1.00} + result = _parse_item(raw) + assert result["product_name_raw"] == "UNKNOWN ITEM" + + def test_alternative_field_names(self): + raw = { + "itemDescription": "ALT NAME ITEM", + "unitPrice": 5.00, + "totalPrice": 5.00, + "qty": 1, + "UPC": "123456789", + "departmentDescription": "GROCERY", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "ALT NAME ITEM" + assert result["unit_price"] == Decimal("5.00") + assert result["upc"] == "123456789" + assert result["category_raw"] == "GROCERY" + + +class TestParseMeijerReceipt: + def test_full_receipt(self, meijer_receipt_data): + raw = RawReceipt( + receipt_id="TXN-2026-0310-001", + purchase_date="2026-03-10T14:30:00Z", + store_number="42", + raw_data=meijer_receipt_data, + ) + result 
= parse_meijer_receipt(raw) + + assert result["receipt_id"] == "TXN-2026-0310-001" + assert result["purchase_date"] == "2026-03-10T14:30:00Z" + assert result["total"] == Decimal("87.42") + assert result["subtotal"] == Decimal("74.92") + assert result["tax"] == Decimal("5.24") + assert result["savings_total"] == Decimal("12.50") + + # Should have 5 items (voided item excluded) + assert len(result["items"]) == 5 + + # Verify first item + bananas = result["items"][0] + assert bananas["product_name_raw"] == "ORGANIC BANANAS" + assert bananas["upc"] == "4011" + + def test_voided_items_excluded(self, meijer_receipt_data): + raw = RawReceipt( + receipt_id="TXN-2026-0310-001", + purchase_date="2026-03-10", + raw_data=meijer_receipt_data, + ) + result = parse_meijer_receipt(raw) + + item_names = [i["product_name_raw"] for i in result["items"]] + assert "VOIDED SODA 12PK" not in item_names + + def test_empty_receipt(self): + raw = RawReceipt( + receipt_id="EMPTY-001", + purchase_date="2026-03-10", + raw_data={"detail": {"items": [], "total": 0}}, + ) + result = parse_meijer_receipt(raw) + assert result["items"] == [] + assert result["total"] == Decimal("0") + + def test_receipt_with_no_detail(self): + raw = RawReceipt( + receipt_id="NO-DETAIL-001", + purchase_date="2026-03-10", + raw_data={"total": 50.00}, + ) + result = parse_meijer_receipt(raw) + assert result["items"] == [] + assert result["total"] == Decimal("50.00") + + def test_raw_data_preserved(self, meijer_receipt_data): + raw = RawReceipt( + receipt_id="TXN-2026-0310-001", + purchase_date="2026-03-10", + raw_data=meijer_receipt_data, + ) + result = parse_meijer_receipt(raw) + assert result["raw_data"] is meijer_receipt_data diff --git a/tests/test_parsers/test_target_parser.py b/tests/test_parsers/test_target_parser.py new file mode 100644 index 0000000..8f197ac --- /dev/null +++ b/tests/test_parsers/test_target_parser.py @@ -0,0 +1,471 @@ +"""Tests for the Target receipt parser.""" + +from decimal import Decimal + 
+from receiptwitness.parsers.target import _parse_item, _to_decimal, parse_target_receipt +from receiptwitness.scrapers.base import RawReceipt + + +class TestToDecimal: + def test_from_int(self): + assert _to_decimal(42) == Decimal("42") + + def test_from_float(self): + assert _to_decimal(3.89) == Decimal("3.89") + + def test_from_string(self): + assert _to_decimal("8.99") == Decimal("8.99") + + def test_none_returns_default(self): + assert _to_decimal(None) == Decimal("0") + + def test_none_custom_default(self): + assert _to_decimal(None, "1") == Decimal("1") + + def test_invalid_string_returns_default(self): + assert _to_decimal("not-a-number") == Decimal("0") + + def test_empty_string_returns_default(self): + assert _to_decimal("") == Decimal("0") + + +class TestParseItem: + def test_standard_item(self): + raw = { + "description": "GOOD & GATHER WHOLE MILK GAL", + "tcin": "14767459", + "upc": "0085239100123", + "quantity": 1, + "unitPrice": 3.89, + "totalPrice": 3.89, + "regularPrice": 4.19, + "circlePrice": 3.89, + "couponDiscount": 0.0, + "circleRewardsDiscount": 0.30, + "department": "GROCERY", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "GOOD & GATHER WHOLE MILK GAL" + assert result["upc"] == "85239100123" + assert result["quantity"] == Decimal("1") + assert result["unit_price"] == Decimal("3.89") + assert result["extended_price"] == Decimal("3.89") + assert result["regular_price"] == Decimal("4.19") + assert result["sale_price"] == Decimal("3.89") + assert result["loyalty_discount"] == Decimal("0.30") + assert result["category_raw"] == "GROCERY" + + def test_weighted_item(self): + raw = { + "description": "DELI SLICED TURKEY BREAST", + "quantity": 0.72, + "unitPrice": 10.99, + "totalPrice": 7.91, + "weight": 0.72, + "weightUom": "LB", + "department": "DELI", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "DELI SLICED TURKEY BREAST" + assert result["upc"] is None + assert result["quantity"] == 
Decimal("0.72") + assert result["unit_price"] == Decimal("10.99") + assert result["extended_price"] == Decimal("7.91") + + def test_missing_extended_price_computed(self): + raw = { + "description": "TEST ITEM", + "quantity": 3, + "unitPrice": 2.49, + } + result = _parse_item(raw) + assert result["extended_price"] == Decimal("2.49") * Decimal("3") + + def test_item_with_coupon(self): + raw = { + "description": "TIDE PODS 42CT", + "upc": "0003700096223", + "quantity": 1, + "unitPrice": 13.49, + "totalPrice": 13.49, + "couponDiscount": 2.50, + } + result = _parse_item(raw) + assert result["coupon_discount"] == Decimal("2.50") + + def test_missing_description_fallback(self): + raw = {"unitPrice": 1.00, "totalPrice": 1.00} + result = _parse_item(raw) + assert result["product_name_raw"] == "UNKNOWN ITEM" + + def test_alternative_field_names(self): + raw = { + "productName": "ALT NAME ITEM", + "price": 5.00, + "extendedPrice": 5.00, + "qty": 1, + "UPC": "123456789", + "category": "FROZEN", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "ALT NAME ITEM" + assert result["unit_price"] == Decimal("5.00") + assert result["extended_price"] == Decimal("5.00") + assert result["upc"] == "123456789" + assert result["category_raw"] == "FROZEN" + + def test_item_description_field_name(self): + raw = { + "itemDescription": "ITEM DESC FIELD", + "price": 3.00, + "lineTotal": 3.00, + } + result = _parse_item(raw) + assert result["product_name_raw"] == "ITEM DESC FIELD" + assert result["unit_price"] == Decimal("3.00") + assert result["extended_price"] == Decimal("3.00") + + def test_null_optional_fields(self): + raw = { + "description": "BANANAS", + "upc": "0000000004011", + "quantity": 1, + "unitPrice": 0.25, + "totalPrice": 0.25, + "circlePrice": None, + "couponDiscount": None, + "circleRewardsDiscount": None, + } + result = _parse_item(raw) + assert result["sale_price"] is None + assert result["coupon_discount"] is None + assert result["loyalty_discount"] is None 
+ + def test_upc_leading_zeros_stripped(self): + raw = { + "description": "TEST", + "upc": "0000000004011", + "unitPrice": 1.00, + "totalPrice": 1.00, + } + result = _parse_item(raw) + assert result["upc"] == "4011" + + def test_description_whitespace_stripped(self): + raw = { + "description": " EXTRA SPACES ", + "unitPrice": 1.00, + "totalPrice": 1.00, + } + result = _parse_item(raw) + assert result["product_name_raw"] == "EXTRA SPACES" + + def test_circle_price_preferred_over_sale_price(self): + raw = { + "description": "CIRCLE ITEM", + "circlePrice": 2.99, + "salePrice": 3.49, + "unitPrice": 2.99, + "totalPrice": 2.99, + } + result = _parse_item(raw) + assert result["sale_price"] == Decimal("2.99") + + def test_sale_price_fallback_when_no_circle_price(self): + raw = { + "description": "SALE ITEM", + "salePrice": 3.49, + "unitPrice": 3.49, + "totalPrice": 3.49, + } + result = _parse_item(raw) + assert result["sale_price"] == Decimal("3.49") + + def test_circle_rewards_discount(self): + raw = { + "description": "CIRCLE REWARDS ITEM", + "circleRewardsDiscount": 1.50, + "unitPrice": 5.00, + "totalPrice": 5.00, + } + result = _parse_item(raw) + assert result["loyalty_discount"] == Decimal("1.50") + + def test_circle_discount_fallback(self): + raw = { + "description": "CIRCLE DISC ITEM", + "circleDiscount": 0.75, + "unitPrice": 3.00, + "totalPrice": 3.00, + } + result = _parse_item(raw) + assert result["loyalty_discount"] == Decimal("0.75") + + def test_bogo_item(self): + raw = { + "description": "BOGO GOOD & GATHER PASTA", + "upc": "0085239300456", + "quantity": 2, + "unitPrice": 1.79, + "totalPrice": 1.79, + "regularPrice": 1.79, + "circlePrice": 0.895, + "circleRewardsDiscount": 1.79, + "promoDescription": "Buy 1 get 1 free", + "department": "GROCERY", + } + result = _parse_item(raw) + assert result["quantity"] == Decimal("2") + assert result["unit_price"] == Decimal("1.79") + assert result["extended_price"] == Decimal("1.79") + assert result["sale_price"] == 
Decimal("0.895") + assert result["loyalty_discount"] == Decimal("1.79") + + def test_multi_quantity_item(self): + raw = { + "description": "MARKET PANTRY EGGS", + "quantity": 2, + "unitPrice": 4.99, + "totalPrice": 9.98, + "department": "GROCERY", + } + result = _parse_item(raw) + assert result["quantity"] == Decimal("2") + assert result["unit_price"] == Decimal("4.99") + assert result["extended_price"] == Decimal("9.98") + + def test_coupon_savings_field(self): + raw = { + "description": "COUPON ITEM", + "couponSavings": 1.00, + "unitPrice": 5.00, + "totalPrice": 5.00, + } + result = _parse_item(raw) + assert result["coupon_discount"] == Decimal("1.00") + + +class TestParseTargetReceipt: + def test_full_receipt(self, target_receipt_data): + raw = RawReceipt( + receipt_id="TGT-2026-0315-7890", + purchase_date="2026-03-15T11:23:00Z", + store_number="2774", + raw_data=target_receipt_data, + ) + result = parse_target_receipt(raw) + + assert result["receipt_id"] == "TGT-2026-0315-7890" + assert result["purchase_date"] == "2026-03-15T11:23:00Z" + assert result["total"] == Decimal("83.21") + assert result["subtotal"] == Decimal("78.32") + assert result["tax"] == Decimal("4.89") + assert result["savings_total"] == Decimal("11.45") + + # Should have 8 items (voided + returned items excluded) + assert len(result["items"]) == 8 + + # Verify first item + milk = result["items"][0] + assert milk["product_name_raw"] == "GOOD & GATHER WHOLE MILK GAL" + assert milk["upc"] == "85239100123" + + def test_voided_items_excluded(self, target_receipt_data): + raw = RawReceipt( + receipt_id="TGT-2026-0315-7890", + purchase_date="2026-03-15", + raw_data=target_receipt_data, + ) + result = parse_target_receipt(raw) + + item_names = [i["product_name_raw"] for i in result["items"]] + assert "VOIDED COCA-COLA 12PK" not in item_names + + def test_returned_items_excluded(self, target_receipt_data): + raw = RawReceipt( + receipt_id="TGT-2026-0315-7890", + purchase_date="2026-03-15", + 
raw_data=target_receipt_data, + ) + result = parse_target_receipt(raw) + + item_names = [i["product_name_raw"] for i in result["items"]] + assert "RETURNED OLAY MOISTURIZER" not in item_names + + def test_return_flag_items_excluded(self): + data = { + "detail": { + "items": [ + { + "description": "NORMAL ITEM", + "unitPrice": 5.00, + "totalPrice": 5.00, + }, + { + "description": "RETURNED VIA FLAG", + "unitPrice": 3.00, + "totalPrice": 3.00, + "returnFlag": True, + }, + { + "description": "IS RETURN ITEM", + "unitPrice": 2.00, + "totalPrice": 2.00, + "isReturn": True, + }, + ], + "total": 5.00, + } + } + raw = RawReceipt( + receipt_id="RET-001", + purchase_date="2026-03-15", + raw_data=data, + ) + result = parse_target_receipt(raw) + assert len(result["items"]) == 1 + assert result["items"][0]["product_name_raw"] == "NORMAL ITEM" + + def test_cancelled_items_excluded(self): + data = { + "detail": { + "items": [ + { + "description": "NORMAL ITEM", + "unitPrice": 5.00, + "totalPrice": 5.00, + }, + { + "description": "CANCELLED ITEM", + "unitPrice": 3.00, + "totalPrice": 3.00, + "status": "CANCELLED", + }, + ], + "total": 5.00, + } + } + raw = RawReceipt( + receipt_id="CAN-001", + purchase_date="2026-03-15", + raw_data=data, + ) + result = parse_target_receipt(raw) + assert len(result["items"]) == 1 + assert result["items"][0]["product_name_raw"] == "NORMAL ITEM" + + def test_empty_receipt(self): + raw = RawReceipt( + receipt_id="EMPTY-001", + purchase_date="2026-03-15", + raw_data={"detail": {"items": [], "total": 0}}, + ) + result = parse_target_receipt(raw) + assert result["items"] == [] + assert result["total"] == Decimal("0") + + def test_receipt_with_no_detail(self): + raw = RawReceipt( + receipt_id="NO-DETAIL-001", + purchase_date="2026-03-15", + raw_data={"total": 50.00}, + ) + result = parse_target_receipt(raw) + assert result["items"] == [] + assert result["total"] == Decimal("50.00") + + def test_raw_data_preserved(self, target_receipt_data): + raw = 
RawReceipt( + receipt_id="TGT-2026-0315-7890", + purchase_date="2026-03-15", + raw_data=target_receipt_data, + ) + result = parse_target_receipt(raw) + assert result["raw_data"] is target_receipt_data + + def test_alternative_total_field_names(self): + raw = RawReceipt( + receipt_id="ALT-001", + purchase_date="2026-03-15", + raw_data={ + "orderTotal": 42.00, + "subTotal": 35.00, + "salesTax": 3.50, + "circleSavings": 5.00, + "detail": {"items": []}, + }, + ) + result = parse_target_receipt(raw) + assert result["total"] == Decimal("42.00") + assert result["subtotal"] == Decimal("35.00") + assert result["tax"] == Decimal("3.50") + assert result["savings_total"] == Decimal("5.00") + + def test_receipt_items_alternative_key(self): + data = { + "detail": { + "lineItems": [ + { + "description": "ALT KEY ITEM", + "unitPrice": 3.00, + "totalPrice": 3.00, + } + ], + "total": 3.00, + } + } + raw = RawReceipt( + receipt_id="ALT-KEY-001", + purchase_date="2026-03-15", + raw_data=data, + ) + result = parse_target_receipt(raw) + assert len(result["items"]) == 1 + assert result["items"][0]["product_name_raw"] == "ALT KEY ITEM" + + def test_source_url_preserved(self): + raw = RawReceipt( + receipt_id="URL-001", + purchase_date="2026-03-15", + raw_data={"detail": {"items": [], "total": 0}}, + source_url="https://api.target.com/order_history/v1/orders/URL-001", + ) + result = parse_target_receipt(raw) + assert result["source_url"] == "https://api.target.com/order_history/v1/orders/URL-001" + + def test_weighted_items_in_full_receipt(self, target_receipt_data): + raw = RawReceipt( + receipt_id="TGT-2026-0315-7890", + purchase_date="2026-03-15", + raw_data=target_receipt_data, + ) + result = parse_target_receipt(raw) + + # Find the weighted turkey item + turkey = next(i for i in result["items"] if "TURKEY" in i["product_name_raw"]) + assert turkey["quantity"] == Decimal("0.72") + assert turkey["unit_price"] == Decimal("10.99") + assert turkey["extended_price"] == Decimal("7.91") + + 
def test_bogo_items_in_full_receipt(self, target_receipt_data): + raw = RawReceipt( + receipt_id="TGT-2026-0315-7890", + purchase_date="2026-03-15", + raw_data=target_receipt_data, + ) + result = parse_target_receipt(raw) + + # Find the BOGO pasta item + pasta = next(i for i in result["items"] if "BOGO" in i["product_name_raw"]) + assert pasta["quantity"] == Decimal("2") + assert pasta["extended_price"] == Decimal("1.79") + assert pasta["loyalty_discount"] == Decimal("1.79") + + def test_grand_total_field(self): + raw = RawReceipt( + receipt_id="GT-001", + purchase_date="2026-03-15", + raw_data={"grandTotal": 99.99, "detail": {"items": []}}, + ) + result = parse_target_receipt(raw) + assert result["total"] == Decimal("99.99") diff --git a/tests/test_pipeline/__init__.py b/tests/test_pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_pipeline/conftest.py b/tests/test_pipeline/conftest.py new file mode 100644 index 0000000..693366f --- /dev/null +++ b/tests/test_pipeline/conftest.py @@ -0,0 +1,23 @@ +"""Shared test fixtures for pipeline tests.""" + +import pytest +from cartsnitch_common.models.base import Base +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + + +@pytest.fixture +def engine(): + """In-memory SQLite engine for unit tests.""" + eng = create_engine("sqlite:///:memory:") + Base.metadata.create_all(eng) + yield eng + eng.dispose() + + +@pytest.fixture +def session(engine): + """SQLAlchemy session bound to in-memory SQLite.""" + factory = sessionmaker(bind=engine) + with factory() as sess: + yield sess diff --git a/tests/test_pipeline/test_matching.py b/tests/test_pipeline/test_matching.py new file mode 100644 index 0000000..408153c --- /dev/null +++ b/tests/test_pipeline/test_matching.py @@ -0,0 +1,161 @@ +"""Tests for product matching & dedup pipeline.""" + +import uuid +from datetime import UTC, datetime +from decimal import Decimal + +from cartsnitch_common.constants import 
MatchConfidence +from cartsnitch_common.models.product import NormalizedProduct +from cartsnitch_common.schemas.purchase import PurchaseItemCreate + +from receiptwitness.pipeline.matching import ( + ProductMatcher, + classify_confidence, + match_purchase_item, +) +from receiptwitness.pipeline.normalization import MatchMethod + + +class TestClassifyConfidence: + def test_upc_always_high(self): + assert classify_confidence(1.0, MatchMethod.UPC) == MatchConfidence.HIGH + assert classify_confidence(0.5, MatchMethod.UPC) == MatchConfidence.HIGH + + def test_name_high(self): + assert classify_confidence(0.9, MatchMethod.NAME) == MatchConfidence.HIGH + assert classify_confidence(0.8, MatchMethod.NAME) == MatchConfidence.HIGH + + def test_name_medium(self): + assert classify_confidence(0.6, MatchMethod.NAME) == MatchConfidence.MEDIUM + assert classify_confidence(0.5, MatchMethod.NAME) == MatchConfidence.MEDIUM + + def test_name_low(self): + assert classify_confidence(0.3, MatchMethod.NAME) == MatchConfidence.LOW + assert classify_confidence(0.0, MatchMethod.NAME) == MatchConfidence.LOW + + +class TestProductMatcher: + def _make_item(self, name: str, upc: str | None = None) -> PurchaseItemCreate: + return PurchaseItemCreate( + product_name_raw=name, + upc=upc, + unit_price=Decimal("3.99"), + extended_price=Decimal("3.99"), + ) + + def test_match_by_upc(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Whole Milk Gallon", + upc_variants=["041250000001"], + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + + matcher = ProductMatcher(session) + item = self._make_item("Kroger Milk", upc="041250000001") + prod, result, confidence = matcher.match_single(item) + + assert prod is not None + assert prod.id == product.id + assert result is not None + assert result.method == MatchMethod.UPC + assert confidence == MatchConfidence.HIGH + + def test_match_by_name(self, session): + product = 
NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Whole Milk Gallon", + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + + matcher = ProductMatcher(session, name_threshold=0.3) + item = self._make_item("Whole Milk Gallon Size") + prod, result, confidence = matcher.match_single(item) + + assert prod is not None + assert result is not None + assert result.method == MatchMethod.NAME + + def test_auto_create_when_no_match(self, session): + matcher = ProductMatcher(session, auto_create=True) + item = self._make_item("Unique Product XYZ 16 oz") + prod, result, confidence = matcher.match_single(item) + + assert prod is not None + assert result is None # No match found, was created + assert confidence == MatchConfidence.LOW + assert prod.canonical_name == "Unique Product XYZ 16 oz" + assert prod.size == "16" + assert prod.size_unit == "oz" + + def test_no_create_when_disabled(self, session): + matcher = ProductMatcher(session, auto_create=False) + item = self._make_item("Nonexistent Product") + prod, result, confidence = matcher.match_single(item) + + assert prod is None + assert result is None + + def test_batch_match(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Large Eggs 12 Count", + upc_variants=["012345"], + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + + matcher = ProductMatcher(session) + items = [ + self._make_item("Large Eggs", upc="012345"), + self._make_item("Brand New Never Seen Product"), + ] + outcomes = matcher.match_items(items) + + assert len(outcomes) == 2 + assert outcomes[0].match is not None + assert outcomes[0].confidence_level == MatchConfidence.HIGH + assert outcomes[0].created_new is False + assert outcomes[1].match is None + assert outcomes[1].created_new is True + + +class TestMatchPurchaseItem: + def test_convenience_function(self, session): + product = NormalizedProduct( + 
id=uuid.uuid4(), + canonical_name="Ground Beef 80/20", + upc_variants=["999888"], + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + + item = PurchaseItemCreate( + product_name_raw="Ground Beef", + upc="999888", + unit_price=Decimal("5.99"), + extended_price=Decimal("5.99"), + ) + prod, confidence = match_purchase_item(session, item) + assert prod is not None + assert confidence == MatchConfidence.HIGH + + def test_auto_create_default(self, session): + item = PurchaseItemCreate( + product_name_raw="Totally New Item", + unit_price=Decimal("1.00"), + extended_price=Decimal("1.00"), + ) + prod, confidence = match_purchase_item(session, item) + assert prod is not None + assert confidence == MatchConfidence.LOW diff --git a/tests/test_pipeline/test_normalization.py b/tests/test_pipeline/test_normalization.py new file mode 100644 index 0000000..de1d566 --- /dev/null +++ b/tests/test_pipeline/test_normalization.py @@ -0,0 +1,158 @@ +"""Tests for product normalization module.""" + +import uuid +from datetime import UTC, datetime + +from cartsnitch_common.models.product import NormalizedProduct + +from receiptwitness.pipeline.normalization import ( + MatchMethod, + clean_name, + extract_size_info, + jaccard_similarity, + match_by_name, + match_by_upc, + normalize_product, +) + + +class TestCleanName: + def test_lowercase(self): + assert clean_name("Kroger WHOLE MILK") == "kroger whole milk" + + def test_removes_size_info(self): + assert "oz" not in clean_name("Milk 16 oz Whole") + + def test_removes_noise_words(self): + cleaned = clean_name("The Original Brand Milk") + assert "the" not in cleaned.split() + assert "original" not in cleaned.split() + assert "brand" not in cleaned.split() + + def test_collapses_whitespace(self): + assert " " not in clean_name("Milk Whole Gallon") + + def test_removes_punctuation(self): + cleaned = clean_name("Meijer's Best (Organic) Milk!") + assert "'" not in cleaned + assert "(" 
not in cleaned + + +class TestExtractSizeInfo: + def test_extracts_oz(self): + result = extract_size_info("Cereal 18 oz box") + assert result == ("18", "oz") + + def test_extracts_fl_oz(self): + result = extract_size_info("Juice 64 fl oz") + assert result == ("64", "fl_oz") + + def test_extracts_lb(self): + result = extract_size_info("Ground Beef 1.5 lb") + assert result == ("1.5", "lb") + + def test_extracts_ct(self): + result = extract_size_info("Eggs Large 12 ct") + assert result == ("12", "ct") + + def test_no_size_returns_none(self): + assert extract_size_info("Bananas") is None + + +class TestJaccardSimilarity: + def test_identical_strings(self): + assert jaccard_similarity("whole milk gallon", "whole milk gallon") == 1.0 + + def test_completely_different(self): + assert jaccard_similarity("apple juice", "ground beef") == 0.0 + + def test_partial_overlap(self): + score = jaccard_similarity("kroger whole milk", "meijer whole milk") + assert 0.4 < score < 0.8 # "whole" and "milk" overlap + + def test_empty_strings(self): + assert jaccard_similarity("", "") == 0.0 + assert jaccard_similarity("milk", "") == 0.0 + + +class TestMatchByUPC: + def test_match_found(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Whole Milk, Gallon", + upc_variants=["0041250000001", "0041250000002"], + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + # SQLite doesn't support JSONB containment — this will raise + # In production (PostgreSQL), this would work + result = match_by_upc(session, "0041250000001") + assert result is not None + assert result.method == MatchMethod.UPC + assert result.confidence == 1.0 + + def test_no_match(self, session): + result = match_by_upc(session, "9999999999999") + assert result is None + + +class TestMatchByName: + def test_exact_name_match(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Whole Milk, Gallon", + 
created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + result = match_by_name(session, "Whole Milk Gallon") + assert result is not None + assert result.method == MatchMethod.NAME + assert result.confidence > 0.5 + + def test_fuzzy_match(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Kroger Whole Milk, 1 Gallon", + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + result = match_by_name(session, "Meijer Whole Milk 1 Gallon", threshold=0.3) + assert result is not None + assert result.confidence > 0.3 + + def test_no_match_below_threshold(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Ground Beef 80/20", + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + result = match_by_name(session, "Apple Juice 64 oz", threshold=0.5) + assert result is None + + +class TestNormalizeProduct: + def test_name_fallback(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Large Eggs, 12 count", + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + result = normalize_product(session, "Large Eggs 12 ct", upc=None) + assert result is not None + assert result.method == MatchMethod.NAME + + def test_no_match(self, session): + result = normalize_product(session, "Nonexistent Product XYZ", upc=None) + assert result is None diff --git a/tests/test_pipeline/test_receipt.py b/tests/test_pipeline/test_receipt.py new file mode 100644 index 0000000..8210713 --- /dev/null +++ b/tests/test_pipeline/test_receipt.py @@ -0,0 +1,204 @@ +"""Tests for receipt normalization pipeline.""" + +import uuid +from datetime import date +from decimal import Decimal + +from receiptwitness.pipeline.receipt import ( + _clean_product_name, + _safe_decimal, + normalize_receipt, + 
parse_meijer_item, +) + + +class TestCleanProductName: + def test_strips_whitespace(self): + assert _clean_product_name(" Milk ") == "Milk" + + def test_removes_leading_punctuation(self): + assert _clean_product_name("---Milk---") == "Milk" + + def test_collapses_internal_whitespace(self): + assert _clean_product_name("Whole Milk Gallon") == "Whole Milk Gallon" + + def test_empty_string(self): + assert _clean_product_name("") == "" + + +class TestSafeDecimal: + def test_string_input(self): + assert _safe_decimal("3.99") == Decimal("3.99") + + def test_float_input(self): + assert _safe_decimal(3.99) == Decimal("3.99") + + def test_int_input(self): + assert _safe_decimal(4) == Decimal("4") + + def test_none_returns_default(self): + assert _safe_decimal(None) == Decimal("0") + + def test_none_custom_default(self): + assert _safe_decimal(None, Decimal("1")) == Decimal("1") + + def test_invalid_returns_default(self): + assert _safe_decimal("not-a-number") == Decimal("0") + + def test_decimal_passthrough(self): + assert _safe_decimal(Decimal("5.50")) == Decimal("5.50") + + +class TestParseMeijerItem: + def test_basic_item(self): + raw = { + "description": "Kroger Whole Milk 1 Gallon", + "upc": "0041250000001", + "quantity": 1, + "unitPrice": "3.99", + "extendedPrice": "3.99", + "category": "DAIRY", + } + item = parse_meijer_item(raw) + assert item.product_name_raw == "Kroger Whole Milk 1 Gallon" + assert item.upc == "41250000001" # leading zeros stripped + assert item.quantity == Decimal("1") + assert item.unit_price == Decimal("3.99") + assert item.extended_price == Decimal("3.99") + assert item.category_raw == "DAIRY" + + def test_alternate_field_names(self): + raw = { + "name": "Eggs Large 12 ct", + "upcCode": "012345", + "qty": 2, + "price": "4.50", + "totalPrice": "9.00", + "department": "EGGS", + } + item = parse_meijer_item(raw) + assert item.product_name_raw == "Eggs Large 12 ct" + assert item.upc == "12345" + assert item.quantity == Decimal("2") + assert 
item.unit_price == Decimal("4.50") + assert item.extended_price == Decimal("9.00") + assert item.category_raw == "EGGS" + + def test_calculates_extended_from_unit_price(self): + raw = { + "description": "Bananas", + "unitPrice": "0.59", + "quantity": 3, + } + item = parse_meijer_item(raw) + assert item.extended_price == Decimal("1.77") + + def test_discounts_parsed(self): + raw = { + "description": "Cereal", + "unitPrice": "4.99", + "extendedPrice": "4.99", + "regularPrice": "5.99", + "salePrice": "4.99", + "couponAmount": "1.00", + "loyaltyAmount": "0.50", + } + item = parse_meijer_item(raw) + assert item.regular_price == Decimal("5.99") + assert item.sale_price == Decimal("4.99") + assert item.coupon_discount == Decimal("1.00") + assert item.loyalty_discount == Decimal("0.50") + + def test_alternate_discount_names(self): + raw = { + "description": "Bread", + "unitPrice": "2.99", + "extendedPrice": "2.99", + "couponDiscount": "0.75", + "loyaltyDiscount": "0.25", + } + item = parse_meijer_item(raw) + assert item.coupon_discount == Decimal("0.75") + assert item.loyalty_discount == Decimal("0.25") + + def test_missing_fields_default_gracefully(self): + raw = {"description": "Mystery Item"} + item = parse_meijer_item(raw) + assert item.product_name_raw == "Mystery Item" + assert item.upc is None + assert item.quantity == Decimal("1") + assert item.unit_price == Decimal("0") + assert item.regular_price is None + assert item.category_raw is None + + def test_no_upc_returns_none(self): + raw = {"description": "Loose Bananas", "unitPrice": "1.00", "extendedPrice": "1.00"} + item = parse_meijer_item(raw) + assert item.upc is None + + +class TestNormalizeReceipt: + def test_full_receipt(self): + user_id = str(uuid.uuid4()) + store_id = str(uuid.uuid4()) + raw = { + "receiptId": "REC-001", + "date": "2026-03-15", + "total": "25.47", + "subtotal": "23.00", + "tax": "2.47", + "savings": "3.00", + "items": [ + {"description": "Milk", "unitPrice": "3.99", "extendedPrice": 
"3.99"}, + {"description": "Bread", "unitPrice": "2.50", "extendedPrice": "2.50"}, + ], + } + purchase = normalize_receipt(raw, user_id, store_id) + assert purchase.receipt_id == "REC-001" + assert purchase.purchase_date == date(2026, 3, 15) + assert purchase.total == Decimal("25.47") + assert purchase.subtotal == Decimal("23.00") + assert purchase.tax == Decimal("2.47") + assert purchase.savings_total == Decimal("3.00") + assert len(purchase.items) == 2 + assert purchase.items[0].product_name_raw == "Milk" + assert purchase.raw_data == raw + + def test_alternate_receipt_fields(self): + user_id = str(uuid.uuid4()) + store_id = str(uuid.uuid4()) + raw = { + "receipt_id": "REC-002", + "purchaseDate": "2026-03-14", + "totalAmount": "10.00", + "taxAmount": "0.75", + "totalSavings": "1.50", + "items": [], + } + purchase = normalize_receipt(raw, user_id, store_id) + assert purchase.receipt_id == "REC-002" + assert purchase.purchase_date == date(2026, 3, 14) + assert purchase.total == Decimal("10.00") + assert purchase.tax == Decimal("0.75") + assert purchase.savings_total == Decimal("1.50") + + def test_missing_date_defaults_to_today(self): + user_id = str(uuid.uuid4()) + store_id = str(uuid.uuid4()) + raw = {"total": "5.00", "items": []} + purchase = normalize_receipt(raw, user_id, store_id) + assert purchase.purchase_date == date.today() + + def test_generates_receipt_id_if_missing(self): + user_id = str(uuid.uuid4()) + store_id = str(uuid.uuid4()) + raw = {"total": "5.00", "date": "2026-03-15", "items": []} + purchase = normalize_receipt(raw, user_id, store_id) + assert purchase.receipt_id # Should be a generated UUID string + + def test_date_object_passthrough(self): + user_id = str(uuid.uuid4()) + store_id = str(uuid.uuid4()) + raw = {"date": date(2026, 1, 1), "total": "5.00", "items": []} + purchase = normalize_receipt(raw, user_id, store_id) + assert purchase.purchase_date == date(2026, 1, 1) diff --git a/tests/test_queue/__init__.py b/tests/test_queue/__init__.py 
new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_queue/test_email_queue.py b/tests/test_queue/test_email_queue.py new file mode 100644 index 0000000..05ffb51 --- /dev/null +++ b/tests/test_queue/test_email_queue.py @@ -0,0 +1,79 @@ +"""Tests for email queue using DragonflyDB Streams.""" + +import pytest +from fakeredis import aioredis as fake_aioredis + +from receiptwitness.queue.email import ( + CONSUMER_GROUP, + STREAM_KEY, + EmailJob, + ack_email, + consume_emails, + enqueue_email, + ensure_consumer_group, +) + + +@pytest.fixture +async def fake_client(): + """Yield a fake async Redis client.""" + client = fake_aioredis.FakeRedis(decode_responses=True) + yield client + await client.aclose() + + +@pytest.fixture +def sample_job(): + """Sample EmailJob for testing.""" + return EmailJob( + user_id="user-123", + sender="no-reply@kroger.com", + recipient="user@example.com", + subject="Kroger Receipt", + body_html="Receipt", + body_plain="Receipt", + received_at="2026-04-01T12:00:00Z", + message_id="msg-abc-123", + ) + + +@pytest.mark.asyncio +async def test_enqueue_and_consume(fake_client, sample_job): + """Enqueue a job, consume it, verify fields match.""" + msg_id = await enqueue_email(fake_client, sample_job) + assert msg_id is not None + + consumed = await consume_emails(fake_client, "test-worker", count=1, block_ms=100) + assert len(consumed) == 1 + consumed_id, consumed_job = consumed[0] + assert consumed_id == msg_id + assert consumed_job.user_id == sample_job.user_id + assert consumed_job.sender == sample_job.sender + assert consumed_job.recipient == sample_job.recipient + assert consumed_job.subject == sample_job.subject + assert consumed_job.message_id == sample_job.message_id + + +@pytest.mark.asyncio +async def test_ack_removes_from_pending(fake_client, sample_job): + """After ack, message is no longer pending.""" + msg_id = await enqueue_email(fake_client, sample_job) + + # Consume the message (moves it to pending) + consumed = await 
consume_emails(fake_client, "test-worker", count=1, block_ms=100) + assert len(consumed) == 1 + + # Acknowledge it + await ack_email(fake_client, msg_id) + + # Check pending count for this consumer group + pending = await fake_client.xpending(STREAM_KEY, CONSUMER_GROUP) + assert pending is None or pending["pending"] == 0 + + +@pytest.mark.asyncio +async def test_ensure_consumer_group_idempotent(fake_client): + """Calling ensure_consumer_group twice does not error.""" + await ensure_consumer_group(fake_client) + # Calling again should not raise + await ensure_consumer_group(fake_client) diff --git a/tests/test_regression/__init__.py b/tests/test_regression/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_regression/test_layout_changes.py b/tests/test_regression/test_layout_changes.py new file mode 100644 index 0000000..7843c43 --- /dev/null +++ b/tests/test_regression/test_layout_changes.py @@ -0,0 +1,435 @@ +"""Regression tests: graceful handling of page layout changes. + +Retailers frequently change their API response structures, field names, +and nesting. These tests verify that both parsers degrade gracefully when +encountering alternative or missing fields — producing valid output +instead of crashing. 
+""" + +from decimal import Decimal + +from receiptwitness.parsers.kroger import parse_kroger_receipt +from receiptwitness.parsers.meijer import parse_meijer_receipt +from receiptwitness.scrapers.base import RawReceipt + + +class TestKrogerFieldNameVariations: + """Kroger changes field names between app versions and API revisions.""" + + def test_alternative_item_key_line_items(self): + raw = RawReceipt( + receipt_id="KR-ALT-1", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "lineItems": [{"description": "MILK", "basePrice": 3.99, "totalPrice": 3.99}], + "total": 3.99, + } + }, + ) + result = parse_kroger_receipt(raw) + assert len(result["items"]) == 1 + assert result["items"][0]["product_name_raw"] == "MILK" + + def test_alternative_item_key_receipt_items(self): + raw = RawReceipt( + receipt_id="KR-ALT-2", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "receiptItems": [ + {"description": "EGGS", "basePrice": 5.49, "totalPrice": 5.49} + ], + "total": 5.49, + } + }, + ) + result = parse_kroger_receipt(raw) + assert len(result["items"]) == 1 + assert result["items"][0]["product_name_raw"] == "EGGS" + + def test_alternative_description_fields(self): + """Test productName and itemDescription fallbacks.""" + for field in ("productName", "itemDescription", "name"): + raw = RawReceipt( + receipt_id="KR-DESC", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "items": [{field: "TEST PRODUCT", "basePrice": 1.00, "totalPrice": 1.00}], + "total": 1.00, + } + }, + ) + result = parse_kroger_receipt(raw) + assert result["items"][0]["product_name_raw"] == "TEST PRODUCT" + + def test_alternative_price_fields(self): + """Test unitPrice and price fallbacks for basePrice.""" + raw = RawReceipt( + receipt_id="KR-PRICE-1", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "items": [{"description": "ITEM A", "unitPrice": 2.50, "totalPrice": 2.50}], + "total": 2.50, + } + }, + ) + result = parse_kroger_receipt(raw) + assert 
result["items"][0]["unit_price"] == Decimal("2.50") + + raw2 = RawReceipt( + receipt_id="KR-PRICE-2", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "items": [{"description": "ITEM B", "price": 4.00, "totalPrice": 4.00}], + "total": 4.00, + } + }, + ) + result2 = parse_kroger_receipt(raw2) + assert result2["items"][0]["unit_price"] == Decimal("4.00") + + def test_alternative_total_fields(self): + """Test orderTotal, grandTotal fallbacks.""" + for field in ("orderTotal", "grandTotal"): + raw = RawReceipt( + receipt_id="KR-TOT", + purchase_date="2026-03-12", + raw_data={field: 42.50, "detail": {}}, + ) + result = parse_kroger_receipt(raw) + assert result["total"] == Decimal("42.50") + + def test_alternative_savings_fields(self): + """Test youSaved and totalDiscount fallbacks.""" + raw = RawReceipt( + receipt_id="KR-SAV-1", + purchase_date="2026-03-12", + raw_data={"youSaved": 5.00, "detail": {}}, + ) + result = parse_kroger_receipt(raw) + assert result["savings_total"] == Decimal("5.00") + + def test_alternative_tax_field(self): + raw = RawReceipt( + receipt_id="KR-TAX", + purchase_date="2026-03-12", + raw_data={"salesTax": 3.25, "detail": {}}, + ) + result = parse_kroger_receipt(raw) + assert result["tax"] == Decimal("3.25") + + def test_alternative_quantity_field_qty(self): + raw = RawReceipt( + receipt_id="KR-QTY", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "items": [ + {"description": "APPLES", "qty": 5, "basePrice": 1.00, "totalPrice": 5.00} + ], + "total": 5.00, + } + }, + ) + result = parse_kroger_receipt(raw) + assert result["items"][0]["quantity"] == Decimal("5") + + def test_alternative_upc_field_kroger_product_id(self): + raw = RawReceipt( + receipt_id="KR-UPC", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "items": [ + { + "description": "ITEM", + "krogerProductId": "12345678", + "basePrice": 1.00, + "totalPrice": 1.00, + } + ], + "total": 1.00, + } + }, + ) + result = parse_kroger_receipt(raw) + assert 
result["items"][0]["upc"] == "12345678" + + def test_missing_extended_price_computed(self): + """When totalPrice is missing, extended_price = unit_price * quantity.""" + raw = RawReceipt( + receipt_id="KR-CALC", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "items": [{"description": "EGGS", "basePrice": 5.49, "quantity": 2}], + "total": 10.98, + } + }, + ) + result = parse_kroger_receipt(raw) + assert result["items"][0]["extended_price"] == Decimal("5.49") * Decimal("2") + + +class TestMeijerFieldNameVariations: + """Meijer XHR endpoints may change field names between SPA versions.""" + + def test_alternative_item_key_line_items(self): + raw = RawReceipt( + receipt_id="MJ-ALT-1", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "lineItems": [{"description": "BANANAS", "price": 0.69, "extendedPrice": 0.69}], + "total": 0.69, + } + }, + ) + result = parse_meijer_receipt(raw) + assert len(result["items"]) == 1 + assert result["items"][0]["product_name_raw"] == "BANANAS" + + def test_alternative_description_fields(self): + for field in ("itemDescription", "name"): + raw = RawReceipt( + receipt_id="MJ-DESC", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [{field: "TEST ITEM", "price": 1.00, "extendedPrice": 1.00}], + "total": 1.00, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["product_name_raw"] == "TEST ITEM" + + def test_alternative_price_field_unit_price(self): + raw = RawReceipt( + receipt_id="MJ-PRICE", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [{"description": "MILK", "unitPrice": 3.49, "totalPrice": 3.49}], + "total": 3.49, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["unit_price"] == Decimal("3.49") + + def test_alternative_extended_price_field_total_price(self): + raw = RawReceipt( + receipt_id="MJ-EXT", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [{"description": "CEREAL", "price": 4.99, "totalPrice": 
4.99}], + "total": 4.99, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["extended_price"] == Decimal("4.99") + + def test_alternative_total_field_transaction_total(self): + raw = RawReceipt( + receipt_id="MJ-TOT", + purchase_date="2026-03-10", + raw_data={"transactionTotal": 55.00, "detail": {}}, + ) + result = parse_meijer_receipt(raw) + assert result["total"] == Decimal("55.00") + + def test_alternative_loyalty_field(self): + raw = RawReceipt( + receipt_id="MJ-LOY", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [ + { + "description": "ITEM", + "price": 5.00, + "extendedPrice": 5.00, + "loyaltyDiscount": 0.50, + } + ], + "total": 5.00, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["loyalty_discount"] == Decimal("0.50") + + def test_alternative_upc_field_uppercase(self): + raw = RawReceipt( + receipt_id="MJ-UPC", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [ + { + "description": "ITEM", + "UPC": "0012345678", + "price": 1.00, + "extendedPrice": 1.00, + } + ], + "total": 1.00, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["upc"] == "12345678" + + def test_alternative_category_field(self): + raw = RawReceipt( + receipt_id="MJ-CAT", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [ + { + "description": "ITEM", + "price": 1.00, + "extendedPrice": 1.00, + "departmentDescription": "FROZEN", + } + ], + "total": 1.00, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["category_raw"] == "FROZEN" + + def test_missing_extended_price_computed(self): + raw = RawReceipt( + receipt_id="MJ-CALC", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [{"description": "MILK", "price": 3.49, "quantity": 2}], + "total": 6.98, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["extended_price"] == Decimal("3.49") * Decimal("2") + + def 
test_missing_description_fallback(self): + raw = RawReceipt( + receipt_id="MJ-NODESC", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [{"price": 1.00, "extendedPrice": 1.00}], + "total": 1.00, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["product_name_raw"] == "UNKNOWN ITEM" + + +class TestMixedFieldVersions: + """Test receipts that mix field naming conventions (happens during rollouts).""" + + def test_kroger_mixed_item_fields(self): + """Some items use old names, some use new names in same receipt.""" + raw = RawReceipt( + receipt_id="KR-MIX", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "items": [ + {"description": "OLD STYLE", "basePrice": 2.00, "totalPrice": 2.00}, + {"productName": "NEW STYLE", "unitPrice": 3.00, "extendedAmount": 3.00}, + ], + "total": 5.00, + } + }, + ) + result = parse_kroger_receipt(raw) + assert len(result["items"]) == 2 + assert result["items"][0]["product_name_raw"] == "OLD STYLE" + assert result["items"][0]["unit_price"] == Decimal("2.00") + assert result["items"][1]["product_name_raw"] == "NEW STYLE" + assert result["items"][1]["unit_price"] == Decimal("3.00") + + def test_kroger_completely_unknown_structure_no_crash(self): + """Receipt with unrecognized structure should return empty items.""" + raw = RawReceipt( + receipt_id="KR-UNKNOWN", + purchase_date="2026-03-12", + raw_data={"something_unexpected": [1, 2, 3], "detail": {"foo": "bar"}}, + ) + result = parse_kroger_receipt(raw) + assert result["receipt_id"] == "KR-UNKNOWN" + assert result["items"] == [] + + def test_meijer_completely_unknown_structure_no_crash(self): + raw = RawReceipt( + receipt_id="MJ-UNKNOWN", + purchase_date="2026-03-10", + raw_data={"something_unexpected": [1, 2, 3], "detail": {"foo": "bar"}}, + ) + result = parse_meijer_receipt(raw) + assert result["receipt_id"] == "MJ-UNKNOWN" + assert result["items"] == [] + + def test_kroger_null_fields_no_crash(self): + """Fields with None values should 
be handled gracefully.""" + raw = RawReceipt( + receipt_id="KR-NULL", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "items": [ + { + "description": "ITEM", + "basePrice": None, + "totalPrice": None, + "quantity": None, + "upc": None, + "department": None, + } + ], + "total": None, + "subtotal": None, + "tax": None, + } + }, + ) + result = parse_kroger_receipt(raw) + assert result["items"][0]["product_name_raw"] == "ITEM" + assert result["items"][0]["unit_price"] == Decimal("0") + + def test_meijer_null_fields_no_crash(self): + raw = RawReceipt( + receipt_id="MJ-NULL", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [ + { + "description": "ITEM", + "price": None, + "extendedPrice": None, + "quantity": None, + "upc": None, + "category": None, + } + ], + "total": None, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["product_name_raw"] == "ITEM" + assert result["items"][0]["unit_price"] == Decimal("0") diff --git a/tests/test_regression/test_rate_limiting.py b/tests/test_regression/test_rate_limiting.py new file mode 100644 index 0000000..1c55495 --- /dev/null +++ b/tests/test_regression/test_rate_limiting.py @@ -0,0 +1,365 @@ +"""Regression tests: rate limiting and retry behavior. + +Validates that scrapers enforce human-like delays between requests +and handle rate-limit/error responses gracefully without infinite retries. 
+""" + +from datetime import UTC, datetime, timedelta +from unittest.mock import AsyncMock, patch + +import pytest + +from receiptwitness.scrapers.base import SessionData +from receiptwitness.scrapers.kroger import DEFAULT_USER_AGENT, KrogerScraper +from receiptwitness.scrapers.meijer import MeijerScraper + + +class TestHumanDelayBehavior: + """Verify that human_delay respects configured bounds.""" + + @pytest.mark.asyncio + async def test_delay_within_bounds(self): + """human_delay should sleep between min_ms/1000 and max_ms/1000 seconds.""" + scraper = KrogerScraper() + sleep_path = "receiptwitness.scrapers.base.asyncio.sleep" + with patch(sleep_path, new_callable=AsyncMock) as mock_sleep: + await scraper.human_delay(100, 200) + mock_sleep.assert_called_once() + delay = mock_sleep.call_args[0][0] + assert 0.1 <= delay <= 0.2 + + @pytest.mark.asyncio + async def test_delay_uses_settings_defaults(self): + """Without explicit args, should use settings.min/max_request_delay_ms.""" + scraper = MeijerScraper() + sleep_path = "receiptwitness.scrapers.base.asyncio.sleep" + with ( + patch("receiptwitness.scrapers.base.settings") as mock_settings, + patch(sleep_path, new_callable=AsyncMock) as mock_sleep, + ): + mock_settings.min_request_delay_ms = 1000 + mock_settings.max_request_delay_ms = 5000 + await scraper.human_delay() + mock_sleep.assert_called_once() + delay = mock_sleep.call_args[0][0] + assert 1.0 <= delay <= 5.0 + + @pytest.mark.asyncio + async def test_delay_is_randomized(self): + """Multiple calls should produce different delays (probabilistic).""" + scraper = KrogerScraper() + delays = [] + sleep_path2 = "receiptwitness.scrapers.base.asyncio.sleep" + with patch(sleep_path2, new_callable=AsyncMock) as mock_sleep: + for _ in range(20): + await scraper.human_delay(100, 5000) + delays.append(mock_sleep.call_args[0][0]) + # With range 100-5000ms, 20 calls should have at least 2 distinct values + assert len(set(delays)) >= 2 + + +class TestKrogerRateLimiting: + 
"""Verify Kroger scraper calls human_delay between receipt fetches.""" + + @pytest.mark.asyncio + async def test_delay_called_between_receipts(self): + """Scraper must call human_delay for each receipt detail fetch.""" + scraper = KrogerScraper() + valid_session = SessionData( + cookies=[{"name": "s", "value": "v", "domain": ".kroger.com", "path": "/"}], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=datetime.now(UTC) + timedelta(hours=2), + ) + + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "orders": [ + { + "orderId": f"KR-{i}", + "purchaseDate": "2026-03-10T14:00:00Z", + "storeNumber": "357", + } + for i in range(3) + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response] + [mock_detail_response] * 3) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock) as mock_delay, + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + + assert len(receipts) == 3 + # human_delay called at least once per receipt (after initial page nav) + # Plus 
once for the initial navigation delay + assert mock_delay.call_count >= 3 + + +class TestMeijerRateLimiting: + """Verify Meijer scraper calls human_delay between receipt fetches.""" + + @pytest.mark.asyncio + async def test_delay_called_between_receipts(self): + scraper = MeijerScraper() + valid_session = SessionData( + cookies=[{"name": "s", "value": "v", "domain": ".meijer.com", "path": "/"}], + user_agent="test", + created_at=datetime.now(UTC), + expires_at=datetime.now(UTC) + timedelta(hours=4), + ) + + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "transactions": [ + { + "transactionId": f"TXN-{i}", + "transactionDate": "2026-03-10T14:00:00Z", + "storeNumber": "42", + } + for i in range(3) + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response] + [mock_detail_response] * 3) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock) as mock_delay, + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + + assert len(receipts) == 3 + assert mock_delay.call_count >= 3 + + 
+class TestGracefulErrorRecovery: + """Scrapers should not retry endlessly on errors.""" + + @pytest.mark.asyncio + async def test_kroger_api_500_returns_empty_not_retry(self): + """500 error should return empty list, not retry.""" + scraper = KrogerScraper() + valid_session = SessionData( + cookies=[{"name": "s", "value": "v", "domain": ".kroger.com", "path": "/"}], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=datetime.now(UTC) + timedelta(hours=2), + ) + + mock_api_response = AsyncMock() + mock_api_response.ok = False + mock_api_response.status = 500 + mock_api_response.status_text = "Internal Server Error" + + mock_request = AsyncMock() + mock_request.get = AsyncMock(return_value=mock_api_response) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert receipts == [] + # Should only call the API once — no retries + assert mock_request.get.call_count == 1 + + @pytest.mark.asyncio + async def test_kroger_429_returns_empty_not_retry(self): + """Rate limit (429) should return empty, not retry.""" + scraper = KrogerScraper() + valid_session = SessionData( + cookies=[{"name": "s", "value": "v", "domain": ".kroger.com", 
"path": "/"}], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=datetime.now(UTC) + timedelta(hours=2), + ) + + mock_api_response = AsyncMock() + mock_api_response.ok = False + mock_api_response.status = 429 + mock_api_response.status_text = "Too Many Requests" + + mock_request = AsyncMock() + mock_request.get = AsyncMock(return_value=mock_api_response) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert receipts == [] + assert mock_request.get.call_count == 1 + + @pytest.mark.asyncio + async def test_meijer_detail_exception_continues(self): + """Exception fetching one receipt detail should not abort remaining receipts.""" + scraper = MeijerScraper() + valid_session = SessionData( + cookies=[{"name": "s", "value": "v", "domain": ".meijer.com", "path": "/"}], + user_agent="test", + created_at=datetime.now(UTC), + expires_at=datetime.now(UTC) + timedelta(hours=4), + ) + + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "transactions": [ + { + "transactionId": "TXN-1", + "transactionDate": "2026-03-10T14:00:00Z", + "storeNumber": "42", + }, + { + 
"transactionId": "TXN-2", + "transactionDate": "2026-03-11T10:00:00Z", + "storeNumber": "42", + }, + ] + } + ) + + # First detail call raises exception, second succeeds + mock_detail_fail = AsyncMock() + mock_detail_fail.ok = False + mock_detail_fail.status = 500 + + mock_detail_ok = AsyncMock() + mock_detail_ok.ok = True + mock_detail_ok.json = AsyncMock(return_value={"items": []}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock( + side_effect=[mock_api_response, mock_detail_fail, mock_detail_ok] + ) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + + # Both receipts should be returned — the first with empty detail + assert len(receipts) == 2 + assert receipts[0].raw_data.get("detail") == {} + assert receipts[1].receipt_id == "TXN-2" diff --git a/tests/test_regression/test_schema_validation.py b/tests/test_regression/test_schema_validation.py new file mode 100644 index 0000000..8dfb10e --- /dev/null +++ b/tests/test_regression/test_schema_validation.py @@ -0,0 +1,364 @@ +"""Regression tests: scraper output matches expected schema. + +Validates that parsed receipts from both Kroger and Meijer conform to the +PurchaseCreate schema contract. 
Uses recorded fixtures to ensure outputs +remain stable across code changes. +""" + +from decimal import Decimal + +from receiptwitness.parsers.kroger import parse_kroger_receipt +from receiptwitness.parsers.meijer import parse_meijer_receipt +from receiptwitness.scrapers.base import RawReceipt + +# Required top-level keys in a parsed receipt +RECEIPT_REQUIRED_KEYS = {"receipt_id", "purchase_date", "total", "items", "raw_data"} +RECEIPT_OPTIONAL_KEYS = {"subtotal", "tax", "savings_total", "source_url"} + +# Required keys in each parsed item +ITEM_REQUIRED_KEYS = { + "product_name_raw", + "upc", + "quantity", + "unit_price", + "extended_price", +} +ITEM_OPTIONAL_KEYS = { + "regular_price", + "sale_price", + "coupon_discount", + "loyalty_discount", + "category_raw", +} + + +def _validate_receipt_schema(result: dict) -> None: + """Assert that a parsed receipt dict conforms to the expected schema.""" + # All required keys present + for key in RECEIPT_REQUIRED_KEYS: + assert key in result, f"Missing required key: {key}" + + # Types + assert isinstance(result["receipt_id"], str) + assert isinstance(result["purchase_date"], str) + assert isinstance(result["total"], Decimal) + assert isinstance(result["items"], list) + assert isinstance(result["raw_data"], dict) + + # Optional keys should be correct types when present + if result.get("subtotal") is not None: + assert isinstance(result["subtotal"], Decimal) + if result.get("tax") is not None: + assert isinstance(result["tax"], Decimal) + if result.get("savings_total") is not None: + assert isinstance(result["savings_total"], Decimal) + if result.get("source_url") is not None: + assert isinstance(result["source_url"], str) + + # No unexpected keys + all_keys = RECEIPT_REQUIRED_KEYS | RECEIPT_OPTIONAL_KEYS + for key in result: + assert key in all_keys, f"Unexpected key in receipt: {key}" + + +def _validate_item_schema(item: dict) -> None: + """Assert that a parsed item dict conforms to the expected schema.""" + for key in 
ITEM_REQUIRED_KEYS: + assert key in item, f"Missing required item key: {key}" + + assert isinstance(item["product_name_raw"], str) + assert len(item["product_name_raw"]) > 0 + assert isinstance(item["quantity"], Decimal) + assert isinstance(item["unit_price"], Decimal) + assert isinstance(item["extended_price"], Decimal) + + # UPC can be None or str + if item["upc"] is not None: + assert isinstance(item["upc"], str) + # UPC should not have leading zeros (stripped during parsing) + assert not item["upc"].startswith("0"), f"UPC has leading zeros: {item['upc']}" + + # Optional Decimal fields + for opt_key in ("regular_price", "sale_price", "coupon_discount", "loyalty_discount"): + if item.get(opt_key) is not None: + assert isinstance(item[opt_key], Decimal), f"{opt_key} should be Decimal" + + if item.get("category_raw") is not None: + assert isinstance(item["category_raw"], str) + + # No unexpected keys + all_keys = ITEM_REQUIRED_KEYS | ITEM_OPTIONAL_KEYS + for key in item: + assert key in all_keys, f"Unexpected key in item: {key}" + + +class TestKrogerSchemaValidation: + def test_full_receipt_schema(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12T16:45:00Z", + store_number="00357", + raw_data=kroger_receipt_data, + source_url="https://www.kroger.com/atlas/v1/receipt/api?orderId=KR-2026-0312-4471", + ) + result = parse_kroger_receipt(raw) + _validate_receipt_schema(result) + for item in result["items"]: + _validate_item_schema(item) + + def test_item_count_excludes_voided_and_returned(self, kroger_receipt_data): + """Fixture has 10 items, 2 should be excluded (voided + returned).""" + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12T16:45:00Z", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + assert len(result["items"]) == 8 + + def test_totals_are_positive_decimals(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + 
purchase_date="2026-03-12T16:45:00Z", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + assert result["total"] > Decimal("0") + assert result["subtotal"] > Decimal("0") + assert result["tax"] > Decimal("0") + assert result["savings_total"] > Decimal("0") + + def test_receipt_id_preserved(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12T16:45:00Z", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + assert result["receipt_id"] == "KR-2026-0312-4471" + + def test_known_product_prices(self, kroger_receipt_data): + """Verify specific products produce correct price extraction.""" + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12T16:45:00Z", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + items_by_name = {i["product_name_raw"]: i for i in result["items"]} + + # Milk: $3.99, regular $4.29 + milk = items_by_name["KROGER WHOLE MILK GAL"] + assert milk["unit_price"] == Decimal("3.99") + assert milk["regular_price"] == Decimal("4.29") + assert milk["sale_price"] == Decimal("3.99") + + # Eggs: qty 2, $5.49 each, total $10.98 + eggs = items_by_name["SIMPLE TRUTH ORG EGGS 12CT"] + assert eggs["quantity"] == Decimal("2") + assert eggs["unit_price"] == Decimal("5.49") + assert eggs["extended_price"] == Decimal("10.98") + + # Deli turkey: weighted item, 0.68 lb + turkey = items_by_name["KROGER DELI TURKEY BREAST"] + assert turkey["quantity"] == Decimal("0.68") + assert turkey["upc"] is None + + def test_multi_quantity_item_correct(self, kroger_receipt_data): + """Pasta is qty=3, unit=$2.49, total=$7.47.""" + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12T16:45:00Z", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + pasta = [i for i in result["items"] if "PASTA" in i["product_name_raw"]][0] + assert pasta["quantity"] == Decimal("3") + assert pasta["unit_price"] 
== Decimal("2.49") + assert pasta["extended_price"] == Decimal("7.47") + + def test_coupon_discount_captured(self, kroger_receipt_data): + """Tide Pods has $2.00 coupon.""" + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12T16:45:00Z", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + tide = [i for i in result["items"] if "TIDE" in i["product_name_raw"]][0] + assert tide["coupon_discount"] == Decimal("2.00") + + +class TestMeijerSchemaValidation: + def test_full_receipt_schema(self, meijer_receipt_data): + raw = RawReceipt( + receipt_id="TXN-2026-0310-001", + purchase_date="2026-03-10T14:30:00Z", + store_number="42", + raw_data=meijer_receipt_data, + source_url="https://www.meijer.com/bin/meijer/profile/receipt?receiptId=TXN-2026-0310-001", + ) + result = parse_meijer_receipt(raw) + _validate_receipt_schema(result) + for item in result["items"]: + _validate_item_schema(item) + + def test_item_count_excludes_voided(self, meijer_receipt_data): + """Fixture has 6 items, 1 should be excluded (voided soda).""" + raw = RawReceipt( + receipt_id="TXN-2026-0310-001", + purchase_date="2026-03-10T14:30:00Z", + raw_data=meijer_receipt_data, + ) + result = parse_meijer_receipt(raw) + assert len(result["items"]) == 5 + + def test_totals_are_positive_decimals(self, meijer_receipt_data): + raw = RawReceipt( + receipt_id="TXN-2026-0310-001", + purchase_date="2026-03-10T14:30:00Z", + raw_data=meijer_receipt_data, + ) + result = parse_meijer_receipt(raw) + assert result["total"] > Decimal("0") + assert result["subtotal"] > Decimal("0") + assert result["tax"] > Decimal("0") + assert result["savings_total"] > Decimal("0") + + def test_receipt_id_preserved(self, meijer_receipt_data): + raw = RawReceipt( + receipt_id="TXN-2026-0310-001", + purchase_date="2026-03-10T14:30:00Z", + raw_data=meijer_receipt_data, + ) + result = parse_meijer_receipt(raw) + assert result["receipt_id"] == "TXN-2026-0310-001" + + def 
test_known_product_prices(self, meijer_receipt_data): + """Verify specific Meijer products produce correct price extraction.""" + raw = RawReceipt( + receipt_id="TXN-2026-0310-001", + purchase_date="2026-03-10T14:30:00Z", + raw_data=meijer_receipt_data, + ) + result = parse_meijer_receipt(raw) + items_by_name = {i["product_name_raw"]: i for i in result["items"]} + + # Bananas: $0.69 + bananas = items_by_name["ORGANIC BANANAS"] + assert bananas["unit_price"] == Decimal("0.69") + assert bananas["mperks_discount"] if "mperks_discount" in bananas else True + assert bananas["loyalty_discount"] == Decimal("0.10") + + # Milk: qty 2, $3.49 each, total $6.98 + milk = items_by_name["MEIJER 2% MILK GAL"] + assert milk["quantity"] == Decimal("2") + assert milk["unit_price"] == Decimal("3.49") + assert milk["extended_price"] == Decimal("6.98") + + # Weighted deli turkey: 0.75 lb at $8.99/lb + turkey = items_by_name["WEIGHTED DELI TURKEY"] + assert turkey["quantity"] == Decimal("0.75") + assert turkey["upc"] is None + + def test_mperks_discount_captured(self, meijer_receipt_data): + """Paper towels has $1.00 mPerks discount.""" + raw = RawReceipt( + receipt_id="TXN-2026-0310-001", + purchase_date="2026-03-10T14:30:00Z", + raw_data=meijer_receipt_data, + ) + result = parse_meijer_receipt(raw) + towels = [i for i in result["items"] if "PAPER TOWELS" in i["product_name_raw"]][0] + assert towels["loyalty_discount"] == Decimal("1.00") + assert towels["coupon_discount"] == Decimal("1.00") + + def test_cheerios_coupon_discount(self, meijer_receipt_data): + """Cheerios has $0.50 coupon.""" + raw = RawReceipt( + receipt_id="TXN-2026-0310-001", + purchase_date="2026-03-10T14:30:00Z", + raw_data=meijer_receipt_data, + ) + result = parse_meijer_receipt(raw) + cheerios = [i for i in result["items"] if "CHEERIOS" in i["product_name_raw"]][0] + assert cheerios["coupon_discount"] == Decimal("0.50") + + +class TestEmptyAndEdgeCaseSchemas: + """Regression tests for edge-case receipts that should 
not crash.""" + + def test_kroger_empty_receipt(self): + raw = RawReceipt(receipt_id="KR-EMPTY", purchase_date="2026-03-12", raw_data={}) + result = parse_kroger_receipt(raw) + _validate_receipt_schema(result) + assert result["items"] == [] + assert result["total"] == Decimal("0") + + def test_meijer_empty_receipt(self): + raw = RawReceipt(receipt_id="MJ-EMPTY", purchase_date="2026-03-10", raw_data={}) + result = parse_meijer_receipt(raw) + _validate_receipt_schema(result) + assert result["items"] == [] + assert result["total"] == Decimal("0") + + def test_kroger_receipt_no_detail(self): + raw = RawReceipt( + receipt_id="KR-NODET", + purchase_date="2026-03-12", + raw_data={"total": 50.00}, + ) + result = parse_kroger_receipt(raw) + _validate_receipt_schema(result) + assert result["items"] == [] + assert result["total"] == Decimal("50.00") + + def test_meijer_receipt_no_detail(self): + raw = RawReceipt( + receipt_id="MJ-NODET", + purchase_date="2026-03-10", + raw_data={"total": 30.00}, + ) + result = parse_meijer_receipt(raw) + _validate_receipt_schema(result) + assert result["items"] == [] + assert result["total"] == Decimal("30.00") + + def test_kroger_receipt_all_voided(self): + """A receipt where every item is voided should have 0 items.""" + raw = RawReceipt( + receipt_id="KR-ALLVOID", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "items": [ + {"description": "VOIDED A", "basePrice": 5.0, "voided": True}, + {"description": "VOIDED B", "basePrice": 3.0, "status": "VOIDED"}, + {"description": "RETURNED C", "basePrice": 7.0, "status": "RETURNED"}, + {"description": "RETURNED D", "basePrice": 2.0, "returnFlag": True}, + ], + "total": 0, + } + }, + ) + result = parse_kroger_receipt(raw) + _validate_receipt_schema(result) + assert result["items"] == [] + + def test_meijer_receipt_all_voided(self): + raw = RawReceipt( + receipt_id="MJ-ALLVOID", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [ + {"description": "VOIDED A", "price": 
5.0, "voided": True}, + {"description": "VOIDED B", "price": 3.0, "status": "VOIDED"}, + ], + "total": 0, + } + }, + ) + result = parse_meijer_receipt(raw) + _validate_receipt_schema(result) + assert result["items"] == [] diff --git a/tests/test_scrapers/__init__.py b/tests/test_scrapers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_scrapers/test_base.py b/tests/test_scrapers/test_base.py new file mode 100644 index 0000000..d0cabac --- /dev/null +++ b/tests/test_scrapers/test_base.py @@ -0,0 +1,58 @@ +"""Tests for the base scraper class.""" + +from datetime import datetime +from unittest.mock import patch + +import pytest + +from receiptwitness.scrapers.base import BaseScraper, RawReceipt, SessionData + + +class ConcreteScraper(BaseScraper): + """Concrete implementation for testing the abstract base.""" + + async def login(self, username, password): + return SessionData( + cookies=[], + user_agent="test", + created_at=datetime.now(), + ) + + async def check_session(self, session): + return True + + async def scrape_receipts(self, session, since=None): + return [] + + def parse_receipt(self, raw): + return {} + + +class TestBaseScraper: + @pytest.mark.asyncio + async def test_human_delay_respects_bounds(self): + scraper = ConcreteScraper() + with patch("receiptwitness.scrapers.base.asyncio.sleep") as mock_sleep: + mock_sleep.return_value = None + await scraper.human_delay(min_ms=100, max_ms=200) + call_args = mock_sleep.call_args[0][0] + assert 0.1 <= call_args <= 0.2 + + def test_raw_receipt_dataclass(self): + receipt = RawReceipt( + receipt_id="test-123", + purchase_date="2026-03-10", + store_number="42", + raw_data={"key": "value"}, + ) + assert receipt.receipt_id == "test-123" + assert receipt.raw_data == {"key": "value"} + + def test_session_data_defaults(self): + session = SessionData( + cookies=[], + user_agent="test", + created_at=datetime.now(), + ) + assert session.expires_at is None + assert session.extra == {} diff --git 
a/tests/test_scrapers/test_kroger_scraper.py b/tests/test_scrapers/test_kroger_scraper.py new file mode 100644 index 0000000..3a88516 --- /dev/null +++ b/tests/test_scrapers/test_kroger_scraper.py @@ -0,0 +1,574 @@ +"""Tests for the Kroger scraper. + +These tests mock Playwright to avoid requiring real Kroger credentials +or network access. They verify the scraper's control flow, session handling, +date filtering, and error resilience. +""" + +from datetime import UTC, datetime, timedelta +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from receiptwitness.scrapers.base import RawReceipt, SessionData +from receiptwitness.scrapers.kroger import ( + DEFAULT_TIMEZONE, + DEFAULT_USER_AGENT, + DEFAULT_VIEWPORT, + KROGER_BASE, + KROGER_LOGIN_PAGE, + KROGER_PURCHASE_HISTORY, + KrogerScraper, +) + + +@pytest.fixture +def scraper(): + return KrogerScraper() + + +@pytest.fixture +def valid_session(): + return SessionData( + cookies=[{"name": "session", "value": "abc123", "domain": ".kroger.com", "path": "/"}], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=datetime.now(UTC) + timedelta(hours=2), + extra={"retailer": "kroger"}, + ) + + +@pytest.fixture +def expired_session(): + return SessionData( + cookies=[{"name": "session", "value": "expired", "domain": ".kroger.com", "path": "/"}], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC) - timedelta(hours=4), + expires_at=datetime.now(UTC) - timedelta(hours=2), + ) + + +class TestKrogerScraperConstants: + def test_base_url(self): + assert KROGER_BASE == "https://www.kroger.com" + + def test_login_page(self): + assert KROGER_LOGIN_PAGE == "https://www.kroger.com/signin" + + def test_purchase_history_page(self): + assert KROGER_PURCHASE_HISTORY == "https://www.kroger.com/mypurchases" + + def test_default_user_agent_is_chrome(self): + assert "Chrome" in DEFAULT_USER_AGENT + assert "Windows" in DEFAULT_USER_AGENT + + def test_default_viewport_hd(self): + assert 
DEFAULT_VIEWPORT == {"width": 1920, "height": 1080} + + def test_default_timezone(self): + assert DEFAULT_TIMEZONE == "America/New_York" + + +class TestCheckSession: + @pytest.mark.asyncio + async def test_expired_session_returns_false(self, scraper, expired_session): + result = await scraper.check_session(expired_session) + assert result is False + + @pytest.mark.asyncio + async def test_no_expiry_checks_via_browser(self, scraper): + session = SessionData( + cookies=[], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=None, + ) + mock_page = AsyncMock() + mock_page.url = "https://www.kroger.com/account/dashboard" + mock_response = MagicMock() + mock_response.ok = True + mock_page.goto = AsyncMock(return_value=mock_response) + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw: + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + result = await scraper.check_session(session) + assert result is True + + @pytest.mark.asyncio + async def test_session_redirected_to_signin_returns_false(self, scraper): + session = SessionData( + cookies=[], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=None, + ) + mock_page = AsyncMock() + mock_page.url = "https://www.kroger.com/signin?redirectUrl=account" + mock_response = MagicMock() + mock_response.ok = True + mock_page.goto = AsyncMock(return_value=mock_response) + + mock_context = AsyncMock() + mock_context.new_page = 
AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw: + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + result = await scraper.check_session(session) + assert result is False + + +class TestLogin: + @pytest.mark.asyncio + async def test_login_returns_session_data(self, scraper): + mock_page = AsyncMock() + mock_page.url = "https://www.kroger.com/" + + # Mock locator chain + mock_email = AsyncMock() + mock_password = AsyncMock() + mock_button = AsyncMock() + mock_page.locator = MagicMock(side_effect=[mock_email, mock_password, mock_button]) + mock_page.wait_for_url = AsyncMock() + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.cookies = AsyncMock( + return_value=[ + {"name": "kroger_session", "value": "test123", "domain": ".kroger.com", "path": "/"} + ] + ) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + session = await scraper.login("user@test.com", 
"password123") + + assert isinstance(session, SessionData) + assert len(session.cookies) == 1 + assert session.cookies[0]["name"] == "kroger_session" + assert session.user_agent == DEFAULT_USER_AGENT + assert session.expires_at is not None + assert session.extra == {"retailer": "kroger"} + + +class TestScrapeReceipts: + @pytest.mark.asyncio + async def test_scrape_returns_receipts(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.status = 200 + mock_api_response.json = AsyncMock( + return_value={ + "orders": [ + { + "orderId": "KR-001", + "purchaseDate": "2026-03-10T14:00:00Z", + "storeNumber": "357", + }, + { + "orderId": "KR-002", + "purchaseDate": "2026-03-11T10:00:00Z", + "storeNumber": "357", + }, + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={"items": []}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock( + side_effect=[mock_api_response, mock_detail_response, mock_detail_response] + ) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + + assert len(receipts) == 2 + assert 
receipts[0].receipt_id == "KR-001" + assert receipts[1].receipt_id == "KR-002" + assert isinstance(receipts[0], RawReceipt) + + @pytest.mark.asyncio + async def test_scrape_filters_by_date(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "orders": [ + { + "orderId": "KR-OLD", + "purchaseDate": "2026-01-01T10:00:00Z", + "storeNumber": "357", + }, + { + "orderId": "KR-NEW", + "purchaseDate": "2026-03-15T10:00:00Z", + "storeNumber": "357", + }, + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + since = datetime(2026, 3, 1, tzinfo=UTC) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session, since=since) + + assert len(receipts) == 1 + assert receipts[0].receipt_id == "KR-NEW" + + @pytest.mark.asyncio + async def test_scrape_handles_api_failure(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = False + 
mock_api_response.status = 500 + mock_api_response.status_text = "Internal Server Error" + + mock_request = AsyncMock() + mock_request.get = AsyncMock(return_value=mock_api_response) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert receipts == [] + + @pytest.mark.asyncio + async def test_scrape_handles_unexpected_response(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock(return_value="not a dict") + + mock_request = AsyncMock() + mock_request.get = AsyncMock(return_value=mock_api_response) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + 
patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert receipts == [] + + @pytest.mark.asyncio + async def test_scrape_alternative_field_names(self, scraper, valid_session): + """Kroger may use 'purchases' instead of 'orders'.""" + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "purchases": [ + { + "receiptId": "KR-ALT-001", + "transactionDate": "2026-03-10T14:00:00Z", + "divisionNumber": "014", + } + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + + assert len(receipts) == 1 + assert receipts[0].receipt_id == "KR-ALT-001" + + @pytest.mark.asyncio + async def 
test_scrape_skips_orders_without_id(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "orders": [ + {"purchaseDate": "2026-03-10T14:00:00Z"}, # no id + {"orderId": "KR-VALID", "purchaseDate": "2026-03-10T14:00:00Z"}, + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert len(receipts) == 1 + assert receipts[0].receipt_id == "KR-VALID" + + @pytest.mark.asyncio + async def test_scrape_skips_orders_with_null_id(self, scraper, valid_session): + """Ensure orderId: null doesn't produce receipt_id='None' (str(None) bug).""" + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "orders": [ + {"orderId": None, "receiptId": None, "purchaseDate": "2026-03-10T14:00:00Z"}, + {"orderId": "KR-REAL", "purchaseDate": "2026-03-10T14:00:00Z"}, + ] + } + ) 
+ + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert len(receipts) == 1 + assert receipts[0].receipt_id == "KR-REAL" + # Verify no receipt has the string "None" as its ID + assert all(r.receipt_id != "None" for r in receipts) + + +class TestParseReceipt: + def test_parse_receipt_delegates_to_parser(self, scraper): + raw = RawReceipt( + receipt_id="KR-001", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "items": [ + { + "description": "TEST ITEM", + "basePrice": 5.00, + "totalPrice": 5.00, + } + ], + "total": 5.00, + } + }, + ) + result = scraper.parse_receipt(raw) + assert result["receipt_id"] == "KR-001" + assert len(result["items"]) == 1 + + def test_receipt_detail_failure_returns_empty(self, scraper): + """Verify receipt detail failures produce empty detail.""" + raw = RawReceipt( + receipt_id="KR-FAIL", + purchase_date="2026-03-12", + raw_data={"total": 10.00, "detail": {}}, + ) + result = 
scraper.parse_receipt(raw) + assert result["receipt_id"] == "KR-FAIL" + assert result["items"] == [] diff --git a/tests/test_scrapers/test_meijer_scraper.py b/tests/test_scrapers/test_meijer_scraper.py new file mode 100644 index 0000000..05664e1 --- /dev/null +++ b/tests/test_scrapers/test_meijer_scraper.py @@ -0,0 +1,585 @@ +"""Tests for the Meijer scraper. + +These tests mock Playwright to avoid requiring real Meijer credentials +or network access. They verify the scraper's control flow, session handling, +date filtering, and error resilience. +""" + +from datetime import UTC, datetime, timedelta +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from receiptwitness.scrapers.base import RawReceipt, SessionData +from receiptwitness.scrapers.meijer import ( + DEFAULT_TIMEZONE, + DEFAULT_USER_AGENT, + DEFAULT_VIEWPORT, + MEIJER_BASE, + MEIJER_LOGIN_PAGE, + MEIJER_MPERKS_HOME, + MEIJER_PURCHASE_HISTORY, + MeijerScraper, +) + + +@pytest.fixture +def scraper(): + return MeijerScraper() + + +@pytest.fixture +def valid_session(): + return SessionData( + cookies=[ + {"name": "meijer_session", "value": "abc123", "domain": ".meijer.com", "path": "/"} + ], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=datetime.now(UTC) + timedelta(hours=4), + ) + + +@pytest.fixture +def expired_session(): + return SessionData( + cookies=[ + {"name": "meijer_session", "value": "expired", "domain": ".meijer.com", "path": "/"} + ], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC) - timedelta(hours=8), + expires_at=datetime.now(UTC) - timedelta(hours=4), + ) + + +class TestMeijerScraperConstants: + def test_base_url(self): + assert MEIJER_BASE == "https://www.meijer.com" + + def test_login_page(self): + assert MEIJER_LOGIN_PAGE == "https://www.meijer.com/shopping/login.html" + + def test_mperks_home(self): + assert MEIJER_MPERKS_HOME == "https://www.meijer.com/mperks.html" + + def test_purchase_history_url(self): + assert ( + 
MEIJER_PURCHASE_HISTORY == "https://www.meijer.com/bin/meijer/profile/purchasehistory" + ) + + def test_default_user_agent_is_chrome(self): + assert "Chrome" in DEFAULT_USER_AGENT + assert "Windows" in DEFAULT_USER_AGENT + + def test_default_viewport_hd(self): + assert DEFAULT_VIEWPORT == {"width": 1920, "height": 1080} + + def test_default_timezone(self): + assert DEFAULT_TIMEZONE == "America/Detroit" + + +class TestCheckSession: + @pytest.mark.asyncio + async def test_expired_session_returns_false(self, scraper, expired_session): + result = await scraper.check_session(expired_session) + assert result is False + + @pytest.mark.asyncio + async def test_no_expiry_checks_via_browser(self, scraper): + session = SessionData( + cookies=[], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=None, + ) + mock_page = AsyncMock() + mock_page.url = "https://www.meijer.com/mperks.html" + mock_response = MagicMock() + mock_response.ok = True + mock_page.goto = AsyncMock(return_value=mock_response) + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw: + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + result = await scraper.check_session(session) + assert result is True + + @pytest.mark.asyncio + async def test_session_redirected_to_login_returns_false(self, scraper): + session = SessionData( + cookies=[], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=None, + ) + mock_page = AsyncMock() 
+ mock_page.url = "https://www.meijer.com/shopping/login.html?redirect=mperks" + mock_response = MagicMock() + mock_response.ok = True + mock_page.goto = AsyncMock(return_value=mock_response) + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw: + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + result = await scraper.check_session(session) + assert result is False + + +class TestLogin: + @pytest.mark.asyncio + async def test_login_returns_session_data(self, scraper): + mock_page = AsyncMock() + mock_page.url = "https://www.meijer.com/mperks.html" + + # Mock locator chain + mock_email = AsyncMock() + mock_password = AsyncMock() + mock_button = AsyncMock() + mock_page.locator = MagicMock(side_effect=[mock_email, mock_password, mock_button]) + mock_page.wait_for_url = AsyncMock() + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.cookies = AsyncMock( + return_value=[ + {"name": "meijer_session", "value": "test456", "domain": ".meijer.com", "path": "/"} + ] + ) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, 
"human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + session = await scraper.login("user@test.com", "password123") + + assert isinstance(session, SessionData) + assert len(session.cookies) == 1 + assert session.cookies[0]["name"] == "meijer_session" + assert session.user_agent == DEFAULT_USER_AGENT + assert session.expires_at is not None + # Meijer sessions last 4 hours + assert session.expires_at > session.created_at + timedelta(hours=3) + + +class TestScrapeReceipts: + @pytest.mark.asyncio + async def test_scrape_returns_receipts(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.status = 200 + mock_api_response.json = AsyncMock( + return_value={ + "transactions": [ + { + "transactionId": "TXN-001", + "transactionDate": "2026-03-10T14:00:00Z", + "storeNumber": "42", + }, + { + "transactionId": "TXN-002", + "transactionDate": "2026-03-11T10:00:00Z", + "storeNumber": "42", + }, + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={"items": []}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock( + side_effect=[mock_api_response, mock_detail_response, mock_detail_response] + ) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + 
patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + + assert len(receipts) == 2 + assert receipts[0].receipt_id == "TXN-001" + assert receipts[1].receipt_id == "TXN-002" + assert isinstance(receipts[0], RawReceipt) + + @pytest.mark.asyncio + async def test_scrape_filters_by_date(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "transactions": [ + { + "transactionId": "TXN-OLD", + "transactionDate": "2026-01-01T10:00:00Z", + "storeNumber": "42", + }, + { + "transactionId": "TXN-NEW", + "transactionDate": "2026-03-15T10:00:00Z", + "storeNumber": "42", + }, + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + since = datetime(2026, 3, 1, tzinfo=UTC) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + 
mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session, since=since) + + assert len(receipts) == 1 + assert receipts[0].receipt_id == "TXN-NEW" + + @pytest.mark.asyncio + async def test_scrape_handles_api_failure(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = False + mock_api_response.status = 500 + mock_api_response.status_text = "Internal Server Error" + + mock_request = AsyncMock() + mock_request.get = AsyncMock(return_value=mock_api_response) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert receipts == [] + + @pytest.mark.asyncio + async def test_scrape_handles_unexpected_response(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock(return_value="not a dict") + + mock_request = AsyncMock() + mock_request.get = AsyncMock(return_value=mock_api_response) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + 
mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert receipts == [] + + @pytest.mark.asyncio + async def test_scrape_alternative_field_names(self, scraper, valid_session): + """Meijer may use 'purchaseHistory' instead of 'transactions'.""" + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "purchaseHistory": [ + { + "receiptId": "MJ-ALT-001", + "purchaseDate": "2026-03-10T14:00:00Z", + "storeId": "99", + } + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = 
AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + + assert len(receipts) == 1 + assert receipts[0].receipt_id == "MJ-ALT-001" + + @pytest.mark.asyncio + async def test_scrape_skips_transactions_without_id(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "transactions": [ + {"transactionDate": "2026-03-10T14:00:00Z"}, # no id + {"transactionId": "TXN-VALID", "transactionDate": "2026-03-10T14:00:00Z"}, + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert len(receipts) == 1 + assert receipts[0].receipt_id == "TXN-VALID" + + @pytest.mark.asyncio + async def test_scrape_receipt_detail_failure_returns_empty_detail(self, scraper, 
valid_session): + """Receipt detail API failure should not crash the scraper.""" + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "transactions": [ + { + "transactionId": "TXN-DETAIL-FAIL", + "transactionDate": "2026-03-10T14:00:00Z", + "storeNumber": "42", + } + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = False + mock_detail_response.status = 404 + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert len(receipts) == 1 + assert receipts[0].receipt_id == "TXN-DETAIL-FAIL" + assert receipts[0].raw_data.get("detail") == {} + + +class TestParseReceipt: + def test_parse_receipt_delegates_to_parser(self, scraper): + raw = RawReceipt( + receipt_id="TXN-001", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [ + { + "description": "TEST ITEM", + "price": 5.00, + "extendedPrice": 5.00, + } + ], + "total": 5.00, + } + }, + ) + result = scraper.parse_receipt(raw) + assert result["receipt_id"] == "TXN-001" + assert 
len(result["items"]) == 1 + + def test_receipt_detail_failure_returns_empty(self, scraper): + raw = RawReceipt( + receipt_id="TXN-FAIL", + purchase_date="2026-03-10", + raw_data={"total": 10.00, "detail": {}}, + ) + result = scraper.parse_receipt(raw) + assert result["receipt_id"] == "TXN-FAIL" + assert result["items"] == [] diff --git a/tests/test_session/__init__.py b/tests/test_session/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_session/test_encryption.py b/tests/test_session/test_encryption.py new file mode 100644 index 0000000..59a57fa --- /dev/null +++ b/tests/test_session/test_encryption.py @@ -0,0 +1,61 @@ +"""Tests for session encryption/decryption.""" + +from unittest.mock import patch + +import pytest +from cryptography.fernet import Fernet, InvalidToken + +from receiptwitness.session.encryption import decrypt_session_data, encrypt_session_data + +TEST_KEY = Fernet.generate_key().decode() + + +@pytest.fixture(autouse=True) +def _mock_encryption_key(): + with patch("receiptwitness.session.encryption.settings") as mock_settings: + mock_settings.session_encryption_key = TEST_KEY + yield + + +class TestEncryptDecrypt: + def test_roundtrip(self): + data = { + "cookies": [{"name": "session", "value": "abc123", "domain": ".meijer.com"}], + "user_agent": "Mozilla/5.0", + } + encrypted = encrypt_session_data(data) + assert isinstance(encrypted, str) + assert encrypted != str(data) + + decrypted = decrypt_session_data(encrypted) + assert decrypted == data + + def test_different_data_different_ciphertext(self): + data1 = {"key": "value1"} + data2 = {"key": "value2"} + enc1 = encrypt_session_data(data1) + enc2 = encrypt_session_data(data2) + assert enc1 != enc2 + + def test_decrypt_with_wrong_key_fails(self): + data = {"cookies": []} + encrypted = encrypt_session_data(data) + + wrong_key = Fernet.generate_key().decode() + with patch("receiptwitness.session.encryption.settings") as mock_settings: + 
mock_settings.session_encryption_key = wrong_key + with pytest.raises(InvalidToken): + decrypt_session_data(encrypted) + + def test_decrypt_tampered_data_fails(self): + data = {"cookies": []} + encrypted = encrypt_session_data(data) + tampered = encrypted[:-5] + "XXXXX" + with pytest.raises(InvalidToken): # tampering invalidates the token signature + decrypt_session_data(tampered) + + def test_no_key_raises_error(self): + with patch("receiptwitness.session.encryption.settings") as mock_settings: + mock_settings.session_encryption_key = "" + with pytest.raises(ValueError, match="RW_SESSION_ENCRYPTION_KEY"): + encrypt_session_data({"test": True}) diff --git a/tests/test_session/test_manager.py b/tests/test_session/test_manager.py new file mode 100644 index 0000000..68e1015 --- /dev/null +++ b/tests/test_session/test_manager.py @@ -0,0 +1,102 @@ +"""Tests for session manager logic.""" + +from datetime import UTC, datetime, timedelta +from unittest.mock import AsyncMock, patch + +import pytest +from cryptography.fernet import Fernet + +from receiptwitness.scrapers.base import SessionData +from receiptwitness.session.manager import ( + get_valid_session, + session_from_db_record, + session_to_db_value, +) + +TEST_KEY = Fernet.generate_key().decode() + + +@pytest.fixture(autouse=True) +def _mock_encryption_key(): + with patch("receiptwitness.session.encryption.settings") as mock_settings: + mock_settings.session_encryption_key = TEST_KEY + yield + + +def _make_session(hours_until_expire: int = 4) -> SessionData: + now = datetime.now(UTC) + return SessionData( + cookies=[{"name": "sid", "value": "test", "domain": ".meijer.com"}], + user_agent="Mozilla/5.0", + created_at=now, + expires_at=now + timedelta(hours=hours_until_expire), + ) + + +class TestSessionSerialization: + def test_roundtrip(self): + session = _make_session() + db_value = session_to_db_value(session) + restored = session_from_db_record(db_value) + + assert restored is not None + assert restored.cookies == session.cookies + assert restored.user_agent ==
session.user_agent + + def test_none_returns_none(self): + assert session_from_db_record(None) is None + + def test_invalid_encrypted_returns_none(self): + assert session_from_db_record("garbage-data") is None + + +class TestGetValidSession: + @pytest.mark.asyncio + async def test_valid_existing_session(self): + session = _make_session() + db_value = session_to_db_value(session) + + scraper = AsyncMock() + scraper.check_session.return_value = True + + result, was_refreshed = await get_valid_session(scraper, db_value, "user", "pass") + assert not was_refreshed + assert result.cookies == session.cookies + scraper.login.assert_not_called() + + @pytest.mark.asyncio + async def test_expired_session_triggers_login(self): + session = _make_session(hours_until_expire=-1) # already expired + db_value = session_to_db_value(session) + + new_session = _make_session() + scraper = AsyncMock() + scraper.login.return_value = new_session + + result, was_refreshed = await get_valid_session(scraper, db_value, "user", "pass") + assert was_refreshed + scraper.login.assert_called_once_with("user", "pass") + + @pytest.mark.asyncio + async def test_no_existing_session_triggers_login(self): + new_session = _make_session() + scraper = AsyncMock() + scraper.login.return_value = new_session + + result, was_refreshed = await get_valid_session(scraper, None, "user", "pass") + assert was_refreshed + scraper.login.assert_called_once() + + @pytest.mark.asyncio + async def test_failed_session_check_triggers_login(self): + session = _make_session() + db_value = session_to_db_value(session) + + new_session = _make_session() + scraper = AsyncMock() + scraper.check_session.return_value = False + scraper.login.return_value = new_session + + result, was_refreshed = await get_valid_session(scraper, db_value, "user", "pass") + assert was_refreshed + scraper.login.assert_called_once() diff --git a/tests/test_worker/__init__.py b/tests/test_worker/__init__.py new file mode 100644 index 0000000..e69de29 diff 
# tests/test_worker/test_email_worker.py
"""Tests for email_worker."""

from decimal import Decimal
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
from fakeredis import aioredis as fake_aioredis

from receiptwitness.parsers.email.base import EmailReceipt
from receiptwitness.queue.email import (
    EmailJob,
)
from receiptwitness.worker.email_worker import (
    process_job,
    resolve_user,
)

# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


@pytest.fixture
async def fake_redis():
    """Fake async Redis client for queue testing."""
    client = fake_aioredis.FakeRedis(decode_responses=True)
    yield client
    # Runs after the requesting test finishes: release the fake connection.
    await client.aclose()


@pytest.fixture
def sample_email_job():
    """Sample EmailJob matching DragonflyDB queue schema."""
    return EmailJob(
        user_id="token-abc-123",
        sender="no-reply@meijer.com",
        recipient="user@example.com",
        subject="Your Meijer Receipt",
        body_html="Total: $42.00",
        body_plain="Total: $42.00",
        received_at="2026-04-01T12:00:00Z",
        message_id="msg-xyz-789",
    )


@pytest.fixture
def sample_email():
    """Sample EmailReceipt for parser testing."""
    return EmailReceipt(
        sender="no-reply@meijer.com",
        recipient="user@example.com",
        subject="Your Meijer Receipt",
        # The two-line body is written with an explicit "\n": a plain string
        # literal cannot span a physical line break.
        # NOTE(review): HTML markup may have been lost from this literal before
        # it reached review - confirm against the committed file.
        body_html="Total: $42.00\nReceipt #12345",
        body_plain="Total: $42.00",
        received_at="2026-04-01T12:00:00Z",
    )


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _session_factory_returning(scalar):
    """Build a stand-in for the async session factory used by ``resolve_user``.

    Args:
        scalar: Value that ``scalar_one_or_none()`` reports on the result of
            the mocked session's ``execute()`` call.

    Returns:
        A ``MagicMock`` which, when called, returns an ``AsyncMock`` session
        that works as an async context manager yielding itself.
    """
    mock_result = MagicMock()
    mock_result.scalar_one_or_none.return_value = scalar

    mock_session = AsyncMock()
    mock_session.execute.return_value = mock_result
    # Make ``async with factory() as session`` hand back this same mock.
    mock_session.__aenter__ = AsyncMock(return_value=mock_session)
    mock_session.__aexit__ = AsyncMock(return_value=None)

    return MagicMock(return_value=mock_session)


# ---------------------------------------------------------------------------
# resolve_user tests
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_resolve_user_valid_token():
    """Valid token returns user_id string."""
    factory = _session_factory_returning("user-uuid-42")

    with patch(
        "receiptwitness.worker.email_worker.get_async_session_factory",
        return_value=factory,
    ):
        user_id = await resolve_user("token-abc-123")

    assert user_id == "user-uuid-42"
    factory.assert_called_once()


@pytest.mark.asyncio
async def test_resolve_user_invalid_token():
    """Invalid token returns None."""
    factory = _session_factory_returning(None)

    with patch(
        "receiptwitness.worker.email_worker.get_async_session_factory",
        return_value=factory,
    ):
        user_id = await resolve_user("bad-token")

    assert user_id is None


# ---------------------------------------------------------------------------
# process_job tests
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_process_job_unknown_retailer(sample_email_job):
    """Unknown retailer logs warning and returns True (ack, no retry)."""
    # A dedicated job with an unrecognised sender domain is built here; the
    # injected ``sample_email_job`` fixture is not used by this test.
    unknown_job = EmailJob(
        user_id="token-abc-123",
        sender="no-reply@unknownretailer.com",
        recipient="user@example.com",
        subject="Receipt",
        body_html="",
        body_plain="",
        received_at="2026-04-01T12:00:00Z",
        message_id="msg-xyz-789",
    )

    with (
        patch(
            "receiptwitness.worker.email_worker.resolve_user",
            return_value="user-uuid-42",
        ),
        patch(
            "receiptwitness.worker.email_worker.publish_receipt_ingested",
            new_callable=AsyncMock,
        ) as mock_publish,
    ):
        result = await process_job("msg-id-1", unknown_job)

    assert result is True
    mock_publish.assert_not_called()


@pytest.mark.asyncio
async def test_process_job_success(sample_email_job, sample_email):
    """Known retailer: full pipeline runs — parse, normalize, publish event."""
    parsed_data = {
        "receipt_id": "RCP-999",
        "purchase_date": "2026-04-01",
        "total": Decimal("42.00"),
        "items": [
            {
                "product_name_raw": "ORGANIC BANANAS",
                "quantity": Decimal("1"),
                "unit_price": Decimal("0.69"),
                "extended_price": Decimal("0.69"),
            },
        ],
    }

    mock_parser = MagicMock()
    mock_parser.parse.return_value = parsed_data

    with (
        patch(
            "receiptwitness.worker.email_worker.resolve_user",
            return_value="user-uuid-42",
        ),
        # Swap in the mock parser for "meijer" only; other registered parsers
        # stay in place (clear=False) and the mapping is restored on exit.
        patch.dict(
            "receiptwitness.worker.email_worker.PARSERS",
            {"meijer": mock_parser},
            clear=False,
        ),
        patch(
            "receiptwitness.worker.email_worker.publish_receipt_ingested",
            new_callable=AsyncMock,
        ) as mock_publish,
    ):
        result = await process_job("msg-id-1", sample_email_job)

    assert result is True
    mock_parser.parse.assert_called_once()
    # The published event must carry the resolved user and the parsed fields.
    mock_publish.assert_called_once_with(
        user_id="user-uuid-42",
        store_slug="meijer",
        purchase_id="RCP-999",
        purchase_date="2026-04-01",
        item_count=1,
        total=Decimal("42.00"),
    )