From 342906c9d178923d462a08aec35e486703366eba Mon Sep 17 00:00:00 2001 From: Coupon Carl Date: Sat, 28 Mar 2026 02:24:22 +0000 Subject: [PATCH] Squashed 'receiptwitness/' content from commit e8d374a git-subtree-dir: receiptwitness git-subtree-split: e8d374a89ed8978f429598e02d31b1c5963efe22 --- .dockerignore | 12 + .github/workflows/ci.yml | 168 +++++ .gitignore | 7 + CLAUDE.md | 227 +++++++ Dockerfile | 67 ++ pyproject.toml | 54 ++ renovate.json | 4 + src/receiptwitness/__init__.py | 1 + src/receiptwitness/api/__init__.py | 1 + src/receiptwitness/api/routes.py | 10 + src/receiptwitness/config.py | 26 + src/receiptwitness/events.py | 75 +++ src/receiptwitness/main.py | 8 + src/receiptwitness/parsers/__init__.py | 1 + src/receiptwitness/parsers/kroger.py | 148 +++++ src/receiptwitness/parsers/meijer.py | 138 +++++ src/receiptwitness/parsers/target.py | 191 ++++++ src/receiptwitness/pipeline/__init__.py | 30 + src/receiptwitness/pipeline/matching.py | 136 ++++ src/receiptwitness/pipeline/normalization.py | 155 +++++ src/receiptwitness/pipeline/receipt.py | 144 +++++ src/receiptwitness/scrapers/__init__.py | 1 + src/receiptwitness/scrapers/base.py | 72 +++ src/receiptwitness/scrapers/kroger.py | 344 ++++++++++ src/receiptwitness/scrapers/meijer.py | 301 +++++++++ src/receiptwitness/scrapers/target.py | 326 ++++++++++ src/receiptwitness/session/__init__.py | 1 + src/receiptwitness/session/encryption.py | 52 ++ src/receiptwitness/session/manager.py | 81 +++ tests/conftest.py | 29 + tests/fixtures/kroger_receipt.json | 131 ++++ tests/fixtures/meijer_receipt.json | 85 +++ tests/fixtures/target_receipt.json | 140 +++++ tests/test_parsers/__init__.py | 0 tests/test_parsers/test_kroger_parser.py | 399 ++++++++++++ tests/test_parsers/test_meijer_parser.py | 174 ++++++ tests/test_parsers/test_target_parser.py | 471 ++++++++++++++ tests/test_pipeline/__init__.py | 0 tests/test_pipeline/conftest.py | 23 + tests/test_pipeline/test_matching.py | 161 +++++ 
tests/test_pipeline/test_normalization.py | 158 +++++ tests/test_pipeline/test_receipt.py | 204 ++++++ tests/test_regression/__init__.py | 0 tests/test_regression/test_layout_changes.py | 435 +++++++++++++ tests/test_regression/test_rate_limiting.py | 365 +++++++++++ .../test_regression/test_schema_validation.py | 364 +++++++++++ tests/test_scrapers/__init__.py | 0 tests/test_scrapers/test_base.py | 58 ++ tests/test_scrapers/test_kroger_scraper.py | 574 +++++++++++++++++ tests/test_scrapers/test_meijer_scraper.py | 585 ++++++++++++++++++ tests/test_session/__init__.py | 0 tests/test_session/test_encryption.py | 61 ++ tests/test_session/test_manager.py | 102 +++ 53 files changed, 7300 insertions(+) create mode 100644 .dockerignore create mode 100644 .github/workflows/ci.yml create mode 100644 .gitignore create mode 100644 CLAUDE.md create mode 100644 Dockerfile create mode 100644 pyproject.toml create mode 100644 renovate.json create mode 100644 src/receiptwitness/__init__.py create mode 100644 src/receiptwitness/api/__init__.py create mode 100644 src/receiptwitness/api/routes.py create mode 100644 src/receiptwitness/config.py create mode 100644 src/receiptwitness/events.py create mode 100644 src/receiptwitness/main.py create mode 100644 src/receiptwitness/parsers/__init__.py create mode 100644 src/receiptwitness/parsers/kroger.py create mode 100644 src/receiptwitness/parsers/meijer.py create mode 100644 src/receiptwitness/parsers/target.py create mode 100644 src/receiptwitness/pipeline/__init__.py create mode 100644 src/receiptwitness/pipeline/matching.py create mode 100644 src/receiptwitness/pipeline/normalization.py create mode 100644 src/receiptwitness/pipeline/receipt.py create mode 100644 src/receiptwitness/scrapers/__init__.py create mode 100644 src/receiptwitness/scrapers/base.py create mode 100644 src/receiptwitness/scrapers/kroger.py create mode 100644 src/receiptwitness/scrapers/meijer.py create mode 100644 src/receiptwitness/scrapers/target.py create 
mode 100644 src/receiptwitness/session/__init__.py create mode 100644 src/receiptwitness/session/encryption.py create mode 100644 src/receiptwitness/session/manager.py create mode 100644 tests/conftest.py create mode 100644 tests/fixtures/kroger_receipt.json create mode 100644 tests/fixtures/meijer_receipt.json create mode 100644 tests/fixtures/target_receipt.json create mode 100644 tests/test_parsers/__init__.py create mode 100644 tests/test_parsers/test_kroger_parser.py create mode 100644 tests/test_parsers/test_meijer_parser.py create mode 100644 tests/test_parsers/test_target_parser.py create mode 100644 tests/test_pipeline/__init__.py create mode 100644 tests/test_pipeline/conftest.py create mode 100644 tests/test_pipeline/test_matching.py create mode 100644 tests/test_pipeline/test_normalization.py create mode 100644 tests/test_pipeline/test_receipt.py create mode 100644 tests/test_regression/__init__.py create mode 100644 tests/test_regression/test_layout_changes.py create mode 100644 tests/test_regression/test_rate_limiting.py create mode 100644 tests/test_regression/test_schema_validation.py create mode 100644 tests/test_scrapers/__init__.py create mode 100644 tests/test_scrapers/test_base.py create mode 100644 tests/test_scrapers/test_kroger_scraper.py create mode 100644 tests/test_scrapers/test_meijer_scraper.py create mode 100644 tests/test_session/__init__.py create mode 100644 tests/test_session/test_encryption.py create mode 100644 tests/test_session/test_manager.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..289a751 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,12 @@ +__pycache__/ +*.pyc +.pytest_cache/ +*.egg-info/ +dist/ +.venv/ +.env +.git/ +.github/ +tests/ +*.md +renovate.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..785af69 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,168 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + 
branches: [main] + +concurrency: + group: ci-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: write + packages: write + +env: + REGISTRY: ghcr.io + IMAGE_NAME: cartsnitch/receiptwitness + +jobs: + lint: + runs-on: runners-cartsnitch + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + - name: Install cartsnitch-common from GitHub + run: pip install "cartsnitch-common @ git+https://github.com/cartsnitch/common.git@76685ed0384103228cd670b477b967e7752ebe6b" + - run: pip install ruff + - name: Ruff lint + run: ruff check . + - name: Ruff format check + run: ruff format --check . + + typecheck: + runs-on: runners-cartsnitch + continue-on-error: true + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + - name: Install cartsnitch-common from GitHub + run: pip install "cartsnitch-common @ git+https://github.com/cartsnitch/common.git@76685ed0384103228cd670b477b967e7752ebe6b" + - run: pip install -e ".[dev]" mypy + - name: Type check + run: mypy src/receiptwitness + + test: + runs-on: runners-cartsnitch + services: + postgres: + image: postgres:15-alpine + credentials: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + env: + POSTGRES_USER: cartsnitch + POSTGRES_PASSWORD: cartsnitch_test + POSTGRES_DB: cartsnitch_test + ports: + - 5432:5432 + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + redis: + image: redis:7-alpine + credentials: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + ports: + - 6379:6379 + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + env: + DATABASE_URL: postgresql://cartsnitch:cartsnitch_test@localhost:5432/cartsnitch_test + REDIS_URL: redis://localhost:6379/0 + ENCRYPTION_KEY: 
dGVzdC1lbmNyeXB0aW9uLWtleS0xMjM0NTY3ODk= + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + - name: Install cartsnitch-common from GitHub + run: pip install "cartsnitch-common @ git+https://github.com/cartsnitch/common.git@76685ed0384103228cd670b477b967e7752ebe6b" + - run: pip install -e ".[dev]" + - name: Install Playwright browsers + run: playwright install chromium --with-deps + - name: Run tests + run: pytest --tb=short -q + + build-and-push: + runs-on: runners-cartsnitch + needs: [lint, test] + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Generate CalVer tag + id: calver + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + run: | + DATE_TAG=$(date -u +%Y.%m.%d) + EXISTING=$(git tag -l "v${DATE_TAG}*" | sort -V | tail -1) + if [ -z "$EXISTING" ]; then + VERSION="$DATE_TAG" + elif [ "$EXISTING" = "v${DATE_TAG}" ]; then + VERSION="${DATE_TAG}.2" + else + BUILD_NUM=$(echo "$EXISTING" | sed "s/v${DATE_TAG}\.//") + VERSION="${DATE_TAG}.$((BUILD_NUM + 1))" + fi + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + echo "CalVer tag: $VERSION" + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=sha,prefix=sha- + type=raw,value=${{ steps.calver.outputs.version }},enable=${{ github.ref == 'refs/heads/main' }} + type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }} + + - name: Build and push Docker image + uses: docker/build-push-action@v6 + with: + context: 
. + push: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + target: prod + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Create git tag + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + run: | + git tag "v${{ steps.calver.outputs.version }}" + git push origin "v${{ steps.calver.outputs.version }}" diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..687387e --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__/ +*.pyc +.pytest_cache/ +*.egg-info/ +dist/ +.venv/ +.env diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..255b742 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,227 @@ +# ReceiptWitness — CartSnitch Receipt Ingestion Service + +## Project Context + +CartSnitch is a self-hosted grocery price intelligence platform built as a polyrepo microservices architecture. This repo (`cartsnitch/receiptwitness`) is the receipt/purchase history ingestion service. + +**GitHub org:** github.com/cartsnitch +**Domain:** cartsnitch.com + +### CartSnitch Services + +| Repo | Service | Purpose | +|------|---------|---------| +| `cartsnitch/common` | — | Shared models, schemas, utilities | +| `cartsnitch/receiptwitness` | ReceiptWitness | Purchase data ingestion via retailer scrapers (this repo) | +| `cartsnitch/api` | API Gateway | Frontend-facing REST API | +| `cartsnitch/cartsnitch` | Frontend | React PWA (mobile-first) | +| `cartsnitch/stickershock` | StickerShock | Price increase detection & CPI comparison | +| `cartsnitch/shrinkray` | ShrinkRay | Shrinkflation monitoring | +| `cartsnitch/clipartist` | ClipArtist | Coupon/deal watching & shopping optimization | +| `cartsnitch/infra` | — | K8s manifests, Flux kustomizations | + +### Architecture Decisions + +- **Polyrepo:** Each service has its own repo, Dockerfile, CI/CD pipeline. +- **Shared DB:** One PostgreSQL cluster. 
This service writes to `purchases`, `purchase_items`, `price_history` tables. Models come from `cartsnitch-common`. +- **Inter-service comms:** REST (synchronous) + Redis pub/sub (async events). +- **Target scale:** 500–1,000 users. Each user has their own authenticated sessions to up to 3 retailers. + +## What This Service Does + +ReceiptWitness authenticates with grocery retailer web portals using per-user sessions, scrapes purchase history / receipt data, parses it into structured records, and writes it to the shared database. After ingestion, it publishes a `cartsnitch.receipts.ingested` event so downstream services (StickerShock, ClipArtist) can react. + +### Target Retailers (MVP) + +#### Meijer (mPerks) +- **Auth:** No public API. Session cookie-based auth on mperks.meijer.com. +- **Receipt location:** meijer.com/mperks/receipts-savings.html (or underlying XHR endpoints) +- **Approach:** Playwright login → capture session → hit receipt XHR endpoints directly. Map the API calls the frontend makes via browser dev tools network tab. +- **Prior art:** `dapperfu/python_Meijer` (requires MITM proxy for auth — avoid this pattern, prefer direct browser automation). +- **Data available:** Digital receipts appear ~15 minutes after purchase if mPerks ID was used at checkout. Includes item names, prices, discounts, savings. + +#### Kroger +- **Auth:** No public API for purchase history (that's behind Partner API). Session cookie-based auth on kroger.com. +- **Receipt location:** kroger.com/mypurchases +- **Approach:** Playwright login → scrape purchase history pages or intercept XHR endpoints. +- **Anti-bot:** Kroger uses Akamai Bot Manager. Aggressive headless browser detection. Need Playwright stealth, realistic fingerprinting, human-like interaction pacing. +- **Prior art:** `phyllis-vance/KrogerScrape` (.NET, old), `callaginn/kroger-sweeper` (Puppeteer/Node), `ThermoMan/Get-Kroger-Grocery-List` (Greasemonkey userscript). 
+- **Kroger public API:** Free developer account at developer.kroger.com provides product catalog data (`product.compact` scope) — useful for enriching scraped receipt data with UPCs, categories, product images. NOT useful for purchase history. +- **Data available:** Purchase history tied to Kroger Plus loyalty card. Shows items, prices, quantities. + +#### Target (Circle) +- **Auth:** Session-based auth on target.com. +- **Receipt location:** target.com account → Orders → In-store tab, or target.com/account/orders +- **Approach:** Playwright login → scrape in-store purchase history. +- **Data available:** ~1 year of history if user paid with a linked card, used the Target app wallet, or entered their Target Circle phone number at checkout. Includes item names, prices. + +## Tech Stack + +- Python 3.12+ +- Playwright (Python async API) for headless browser automation +- FastAPI (lightweight internal API for triggering scrapes, health checks, status) +- SQLAlchemy 2.0 (via `cartsnitch-common`) +- Redis (pub/sub event publishing) +- APScheduler or Celery (for scheduled scraping jobs) +- cryptography / Fernet (encrypting stored session data) + +## Repo Structure + +``` +receiptwitness/ +├── CLAUDE.md +├── README.md +├── pyproject.toml +├── Dockerfile # Playwright + Chromium headless +├── docker-compose.yml # Local dev (Postgres, Redis, this service) +├── src/ +│ └── receiptwitness/ +│ ├── __init__.py +│ ├── config.py # Service-specific settings +│ ├── main.py # FastAPI app + scheduler bootstrap +│ ├── scrapers/ +│ │ ├── __init__.py +│ │ ├── base.py # Abstract BaseScraper class +│ │ ├── meijer.py # Meijer/mPerks scraper +│ │ ├── kroger.py # Kroger scraper +│ │ └── target.py # Target/Circle scraper +│ ├── parsers/ +│ │ ├── __init__.py +│ │ ├── meijer.py # Parse raw Meijer receipt data → PurchaseItem records +│ │ ├── kroger.py +│ │ └── target.py +│ ├── session/ +│ │ ├── __init__.py +│ │ ├── manager.py # Session storage, retrieval, refresh logic +│ │ └── encryption.py # 
Encrypt/decrypt session cookies at rest +│ ├── scheduler.py # Scrape scheduling (per-user cron jobs) +│ ├── events.py # Publish receipt.ingested events to Redis +│ ├── api/ +│ │ ├── __init__.py +│ │ ├── routes.py # Internal API: trigger scrape, check status, health +│ │ └── auth.py # Internal service auth (API key or JWT) +│ └── enrichment.py # Optional: enrich receipt data via Kroger public API +└── tests/ + ├── conftest.py + ├── fixtures/ # Sample receipt HTML/JSON for testing parsers + │ ├── meijer_receipt.json + │ ├── kroger_receipt.html + │ └── target_receipt.html + ├── test_scrapers/ + ├── test_parsers/ + └── test_session/ +``` + +## Scraper Architecture + +### Base Scraper Pattern + +```python +class BaseScraper(ABC): + """All retailer scrapers implement this interface.""" + + @abstractmethod + async def login(self, credentials: UserStoreAccount) -> SessionData: ... + + @abstractmethod + async def check_session(self, session: SessionData) -> bool: ... + + @abstractmethod + async def scrape_receipts(self, session: SessionData, since: datetime | None) -> list[RawReceipt]: ... + + @abstractmethod + def parse_receipt(self, raw: RawReceipt) -> tuple[Purchase, list[PurchaseItem]]: ... +``` + +### Scraping Flow + +1. **Scheduler fires** for a user+store combination +2. **Load session** from `user_store_accounts` table (encrypted) +3. **Check session validity** — quick lightweight request to verify auth +4. **If expired:** launch Playwright, re-authenticate, save new session +5. **Scrape receipts** since `last_sync_at` timestamp +6. **Parse** raw data into `Purchase` and `PurchaseItem` records +7. **Deduplicate** — skip receipts already in DB (match on `receipt_id` per store) +8. **Write to DB** — insert new purchases and items +9. **Derive price_history** entries from purchase_items +10. **Publish event** — `cartsnitch.receipts.ingested` to Redis +11. 
**Update** `user_store_accounts.last_sync_at` + +### Session Management + +- Sessions (cookies, tokens) are encrypted at rest using Fernet symmetric encryption. +- The encryption key is provided via environment variable, not stored in the DB. +- Sessions are stored in the `user_store_accounts` table as encrypted JSONB. +- Each scrape attempt first checks if the existing session is valid before launching a full Playwright browser instance. +- When a session expires, the service needs the user's stored credentials OR a manual re-auth flow (the user logs in via the frontend, and we capture the session). + +### Anti-Bot Considerations + +- Use `playwright-stealth` or equivalent to mask automation signals. +- Set realistic viewport sizes, user agents, and locale settings. +- Add human-like delays between page navigations (randomized 1-5 seconds). +- For Kroger specifically (Akamai Bot Manager): may need to use non-headless mode on initial auth, or route through a persistent browser profile that has established trust. +- Rate limit scraping: no more than 1 scrape per user per store per hour. Default cadence: once daily. +- Store and reuse browser profiles/cookies to minimize fresh logins. + +### Dockerfile + +The Dockerfile must include Playwright and Chromium. Base image pattern: + +```dockerfile +FROM mcr.microsoft.com/playwright/python:v1.49.0-noble +# Install deps, copy code, etc. +``` + +This is a large image (~2GB) due to Chromium. Consider multi-stage builds if the final image can be slimmed down. 
+ +## Internal API Endpoints + +This service exposes a lightweight internal API (not public-facing): + +- `GET /health` — health check +- `GET /status/{user_id}` — sync status per store for a user +- `POST /scrape/{user_id}/{store_slug}` — trigger an immediate scrape for a user+store +- `POST /scrape/{user_id}/all` — trigger scrape across all configured stores +- `GET /sessions/{user_id}` — list configured store sessions and their status + +The public-facing API gateway (`cartsnitch/api`) proxies user-facing requests to this service's internal API. + +## Events Published + +### `cartsnitch.receipts.ingested` + +Published after new receipt data is successfully written to the DB. + +```json +{ + "event_type": "cartsnitch.receipts.ingested", + "timestamp": "2026-03-15T12:00:00Z", + "service": "receiptwitness", + "payload": { + "user_id": "uuid", + "store_slug": "meijer", + "purchase_id": "uuid", + "purchase_date": "2026-03-14", + "item_count": 23, + "total": 87.42 + } +} +``` + +## Development Workflow + +- **Never push directly to main.** Always create feature branches and open PRs. +- Branch naming: `feature//` or `fix/` +- Use conventional commits: `feat:`, `fix:`, `refactor:`, `docs:`, `chore:` +- Test parsers with fixture data (sample receipts in `tests/fixtures/`). Scraper integration tests require real credentials and should be tagged/skipped in CI. +- Local dev: `docker-compose up` starts Postgres, Redis, and the service. Playwright runs inside the container. + +## Important Notes + +- The Playwright container image is large. On K8s, consider using a dedicated node or tolerating scheduling delays. +- Each user needs their own authenticated sessions. At 1,000 users × 3 stores = 3,000 sessions to manage. Sessions expire at different rates per retailer. +- Scraping must be respectful: randomized intervals, rate limiting, no parallel scraping of the same store for the same user. +- Receipt data structure varies significantly between retailers. 
The parsers must be robust and handle edge cases (returns, voided items, weighted produce, BOGO items, coupon stacking). +- Kroger's public API (`product.compact` scope) can be used to enrich scraped data with UPCs and product metadata after receipt parsing. This is optional but improves product normalization downstream. +- Store credentials for users should ideally NOT be stored by CartSnitch. Prefer a flow where the user authenticates in a controlled browser session, and we capture/store only the resulting session cookies. If credential storage is necessary, use strong encryption and make the tradeoffs clear to users. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..bb6300d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,67 @@ +# Stage 1: Build dependencies +FROM python:3.12-slim AS build + +WORKDIR /app + +# git is required to install cartsnitch-common from GitHub; build-essential and +# libpq-dev are needed to compile any C-extension wheels (e.g. psycopg2 fallback) +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + libpq-dev \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +COPY pyproject.toml ./ +COPY src/ ./src/ + +# cartsnitch-common is not on PyPI — install it directly from GitHub, then +# install the rest of the package dependencies in a single resolver pass so +# pip can satisfy the cartsnitch-common>=0.1.0 constraint declared in +# pyproject.toml without hitting PyPI for it. +RUN pip install --no-cache-dir --prefix=/install \ + "cartsnitch-common @ git+https://github.com/cartsnitch/common.git@76685ed0384103228cd670b477b967e7752ebe6b" \ + . 
+ +# Stage 2: Production image with Playwright + Chromium +FROM python:3.12-slim AS prod + +WORKDIR /app + +# Install Playwright system dependencies for Chromium +RUN apt-get update && apt-get install -y --no-install-recommends \ + libnss3 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdrm2 \ + libxkbcommon0 \ + libxcomposite1 \ + libxdamage1 \ + libxrandr2 \ + libgbm1 \ + libpango-1.0-0 \ + libcairo2 \ + libasound2 \ + libxshmfence1 \ + libx11-xcb1 \ + libxcb-dri3-0 \ + fonts-liberation \ + && rm -rf /var/lib/apt/lists/* + +RUN adduser --system --group --uid 1000 app + +COPY --from=build /install /usr/local +COPY src/ ./src/ + +# Install Playwright Chromium browser (runs as root; /opt/playwright is world-readable) +RUN PLAYWRIGHT_BROWSERS_PATH=/opt/playwright playwright install chromium + +ENV PLAYWRIGHT_BROWSERS_PATH=/opt/playwright + +USER 1000 +EXPOSE 8000 + +HEALTHCHECK --interval=30s --timeout=3s \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" + +CMD ["uvicorn", "receiptwitness.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f32acfc --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "receiptwitness" +version = "0.1.0" +description = "CartSnitch receipt/purchase history ingestion service" +requires-python = ">=3.12" +dependencies = [ + "cartsnitch-common>=0.1.0", + "playwright>=1.49,<2.0", + "playwright-stealth>=1.0,<2.0", + "cryptography>=42.0,<44.0", + "fastapi>=0.115,<1.0", + "uvicorn[standard]>=0.30,<1.0", + "redis>=5.0,<6.0", + "pydantic>=2.0,<3.0", + "pydantic-settings>=2.0,<3.0", + "sqlalchemy[asyncio]>=2.0,<3.0", + "asyncpg>=0.29,<1.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0", + "pytest-asyncio>=0.23", + "ruff>=0.3", + "pytest-cov>=5.0", +] + +[tool.hatch.build.targets.wheel] +packages = 
["src/receiptwitness"] + +[tool.ruff] +target-version = "py312" +line-length = 100 + +[tool.ruff.lint] +select = ["E", "F", "I", "N", "W", "UP"] + +[tool.mypy] +python_version = "3.12" +strict = false +warn_return_any = true +warn_unused_ignores = true + +[[tool.mypy.overrides]] +module = "cartsnitch_common.*" +ignore_missing_imports = true + +[tool.pytest.ini_options] +asyncio_mode = "auto" +testpaths = ["tests"] diff --git a/renovate.json b/renovate.json new file mode 100644 index 0000000..833ba3b --- /dev/null +++ b/renovate.json @@ -0,0 +1,4 @@ +{ + "$schema": "https://docs.renovatebot.com/renovate-schema.json", + "extends": ["local>cartsnitch/.github:renovate-config"] +} diff --git a/src/receiptwitness/__init__.py b/src/receiptwitness/__init__.py new file mode 100644 index 0000000..6b17aab --- /dev/null +++ b/src/receiptwitness/__init__.py @@ -0,0 +1 @@ +"""ReceiptWitness — CartSnitch receipt ingestion service.""" diff --git a/src/receiptwitness/api/__init__.py b/src/receiptwitness/api/__init__.py new file mode 100644 index 0000000..74ded59 --- /dev/null +++ b/src/receiptwitness/api/__init__.py @@ -0,0 +1 @@ +"""Internal API for ReceiptWitness service.""" diff --git a/src/receiptwitness/api/routes.py b/src/receiptwitness/api/routes.py new file mode 100644 index 0000000..23cc109 --- /dev/null +++ b/src/receiptwitness/api/routes.py @@ -0,0 +1,10 @@ +"""Internal API routes for triggering scrapes and checking status.""" + +from fastapi import APIRouter + +router = APIRouter() + + +@router.get("/health") +async def health(): + return {"status": "ok", "service": "receiptwitness"} diff --git a/src/receiptwitness/config.py b/src/receiptwitness/config.py new file mode 100644 index 0000000..1341f3f --- /dev/null +++ b/src/receiptwitness/config.py @@ -0,0 +1,26 @@ +"""Service-specific configuration for ReceiptWitness.""" + +from pydantic_settings import BaseSettings + + +class ReceiptWitnessSettings(BaseSettings): + model_config = {"env_prefix": "RW_"} + + # Inherited 
logger = logging.getLogger(__name__)

CHANNEL_RECEIPTS_INGESTED = "cartsnitch.receipts.ingested"

# Lazily created on first use and then reused by every client this module hands out.
_pool: aioredis.ConnectionPool | None = None


class _DecimalEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``Decimal`` values as floats."""

    def default(self, o):
        # Everything that is not a Decimal is delegated to the base encoder.
        if not isinstance(o, Decimal):
            return super().default(o)
        return float(o)


def _get_pool() -> aioredis.ConnectionPool:
    """Return the module-wide connection pool, building it on first call."""
    global _pool
    if _pool is not None:
        return _pool
    _pool = aioredis.ConnectionPool.from_url(
        settings.redis_url,
        decode_responses=True,
        max_connections=10,
    )
    return _pool


async def get_redis_client() -> aioredis.Redis:
    """Return an async Redis/DragonflyDB client bound to the shared pool."""
    shared_pool = _get_pool()
    return aioredis.Redis(connection_pool=shared_pool)


async def publish_receipt_ingested(
    user_id: str,
    store_slug: str,
    purchase_id: str,
    purchase_date: str,
    item_count: int,
    total: Decimal | float,
) -> None:
    """Publish a ``cartsnitch.receipts.ingested`` event for one purchase.

    The event is JSON-encoded and published on ``CHANNEL_RECEIPTS_INGESTED``.

    Raises:
        redis.asyncio.ConnectionError: re-raised after being logged when the
            publish fails because of a connection error.
    """
    # Decimal totals are converted up front; other values are passed through.
    total_value = float(total) if isinstance(total, Decimal) else total
    payload = {
        "user_id": user_id,
        "store_slug": store_slug,
        "purchase_id": purchase_id,
        "purchase_date": purchase_date,
        "item_count": item_count,
        "total": total_value,
    }
    event = {
        "event_type": CHANNEL_RECEIPTS_INGESTED,
        "timestamp": datetime.now(UTC).isoformat(),
        "service": "receiptwitness",
        "payload": payload,
    }

    try:
        client = await get_redis_client()
        message = json.dumps(event, cls=_DecimalEncoder)
        await client.publish(CHANNEL_RECEIPTS_INGESTED, message)
        logger.info(
            "Published %s event for purchase %s",
            CHANNEL_RECEIPTS_INGESTED,
            purchase_id,
        )
    except aioredis.ConnectionError:
        logger.error("Failed to publish event — Redis/DragonflyDB connection error")
        raise
+""" + +import logging +from decimal import Decimal, InvalidOperation + +from receiptwitness.scrapers.base import RawReceipt + +logger = logging.getLogger(__name__) + + +def _to_decimal(value, default: str = "0") -> Decimal: + """Safely convert a value to Decimal.""" + if value is None: + return Decimal(default) + try: + return Decimal(str(value)) + except (InvalidOperation, ValueError, TypeError): + return Decimal(default) + + +def _parse_item(item: dict) -> dict: + """Parse a single line item from a Kroger receipt. + + Kroger items typically include fields like: + - description / itemDescription / productName + - upc / krogerProductId + - quantity / qty + - basePrice / unitPrice / price + - totalPrice / extendedAmount / lineTotal + - regularPrice / originalPrice + - salePrice / promoPrice + - couponAmount / couponSavings + - loyaltyDiscount / fuelPointsDiscount / plusCardSavings + - department / category / aisle + """ + description = ( + item.get("description") + or item.get("itemDescription") + or item.get("productName") + or item.get("name") + or "UNKNOWN ITEM" + ) + + quantity = _to_decimal(item.get("quantity", item.get("qty", item.get("quantitySold", 1))), "1") + unit_price = _to_decimal(item.get("basePrice", item.get("unitPrice", item.get("price", 0)))) + extended_price = _to_decimal( + item.get("totalPrice", item.get("extendedAmount", item.get("lineTotal"))) + ) + + # Compute extended_price if not provided + if extended_price == Decimal("0") and unit_price != Decimal("0"): + extended_price = unit_price * quantity + + regular_price = item.get("regularPrice", item.get("originalPrice")) + sale_price = item.get("salePrice", item.get("promoPrice")) + coupon_discount = item.get( + "couponAmount", item.get("couponSavings", item.get("couponDiscount")) + ) + loyalty_discount = item.get( + "plusCardSavings", + item.get("loyaltyDiscount", item.get("fuelPointsDiscount")), + ) + + # UPC handling — Kroger may use krogerProductId or upc + upc = item.get("upc", 
item.get("UPC", item.get("krogerProductId"))) + if upc: + upc = str(upc).strip().lstrip("0") or None + + category = item.get("department", item.get("category", item.get("aisle"))) + + # Weight info for produce/deli items + weight = item.get("weight", item.get("netWeight")) + extra = {} + if weight is not None: + extra["weight"] = str(weight) + weight_uom = item.get("weightUom", item.get("unitOfMeasure")) + if weight_uom: + extra["weight_uom"] = weight_uom + + result = { + "product_name_raw": description.strip(), + "upc": upc, + "quantity": quantity, + "unit_price": unit_price, + "extended_price": extended_price, + "regular_price": (_to_decimal(regular_price) if regular_price is not None else None), + "sale_price": (_to_decimal(sale_price) if sale_price is not None else None), + "coupon_discount": (_to_decimal(coupon_discount) if coupon_discount is not None else None), + "loyalty_discount": ( + _to_decimal(loyalty_discount) if loyalty_discount is not None else None + ), + "category_raw": category.strip() if category else None, + } + + return result + + +def parse_kroger_receipt(raw: RawReceipt) -> dict: + """Parse a RawReceipt from Kroger into a PurchaseCreate-compatible dict.""" + data = raw.raw_data + detail = data.get("detail", {}) + + # Parse items — Kroger uses "items" or "lineItems" or "receiptItems" + raw_items = detail.get("items", detail.get("lineItems", detail.get("receiptItems", []))) + items = [] + for raw_item in raw_items: + # Skip voided / returned items + if raw_item.get("voided") or raw_item.get("status") in ( + "VOIDED", + "RETURNED", + ): + logger.debug("Skipping voided/returned item: %s", raw_item.get("description")) + continue + if raw_item.get("returnFlag") or raw_item.get("isReturn"): + logger.debug("Skipping returned item: %s", raw_item.get("description")) + continue + items.append(_parse_item(raw_item)) + + # Parse totals — Kroger uses various field names + total = _to_decimal( + detail.get( + "total", + data.get("total", 
data.get("orderTotal", data.get("grandTotal", 0))), + ) + ) + subtotal = detail.get("subtotal", data.get("subtotal", data.get("subTotal"))) + tax = detail.get("tax", data.get("tax", data.get("salesTax"))) + savings = detail.get( + "totalSavings", + data.get("savings", data.get("totalDiscount", data.get("youSaved"))), + ) + + return { + "receipt_id": raw.receipt_id, + "purchase_date": raw.purchase_date, + "total": total, + "subtotal": _to_decimal(subtotal) if subtotal is not None else None, + "tax": _to_decimal(tax) if tax is not None else None, + "savings_total": _to_decimal(savings) if savings is not None else None, + "source_url": raw.source_url, + "raw_data": data, + "items": items, + } diff --git a/src/receiptwitness/parsers/meijer.py b/src/receiptwitness/parsers/meijer.py new file mode 100644 index 0000000..d1960d0 --- /dev/null +++ b/src/receiptwitness/parsers/meijer.py @@ -0,0 +1,138 @@ +"""Parse raw Meijer mPerks receipt data into PurchaseCreate-compatible dicts. + +The mPerks receipt JSON structure (reverse-engineered from their SPA) +typically looks like: + +Transaction listing: +{ + "transactions": [ + { + "transactionId": "12345", + "transactionDate": "2026-03-10T14:30:00Z", + "storeNumber": "123", + "total": 87.42, + "savings": 12.50 + } + ] +} + +Receipt detail: +{ + "receiptId": "12345", + "items": [ + { + "description": "ORGANIC BANANAS", + "upc": "0000000004011", + "quantity": 1, + "price": 0.69, + "extendedPrice": 0.69, + "regularPrice": 0.79, + "salePrice": 0.69, + "couponDiscount": 0.0, + "mperksDiscount": 0.10, + "category": "PRODUCE" + } + ], + "subtotal": 74.92, + "tax": 5.24, + "total": 87.42, + "totalSavings": 12.50 +} +""" + +import logging +from decimal import Decimal, InvalidOperation + +from receiptwitness.scrapers.base import RawReceipt + +logger = logging.getLogger(__name__) + + +def _to_decimal(value, default: str = "0") -> Decimal: + """Safely convert a value to Decimal.""" + if value is None: + return Decimal(default) + try: + 
return Decimal(str(value)) + except (InvalidOperation, ValueError, TypeError): + return Decimal(default) + + +def _parse_item(item: dict) -> dict: + """Parse a single line item from Meijer receipt detail.""" + description = ( + item.get("description") or item.get("itemDescription") or item.get("name") or "UNKNOWN ITEM" + ) + + quantity = _to_decimal(item.get("quantity", item.get("qty", 1)), "1") + unit_price = _to_decimal(item.get("price", item.get("unitPrice", 0))) + extended_price = _to_decimal(item.get("extendedPrice", item.get("totalPrice"))) + + # If extended_price wasn't provided, compute it + if extended_price == Decimal("0") and unit_price != Decimal("0"): + extended_price = unit_price * quantity + + regular_price = item.get("regularPrice") + sale_price = item.get("salePrice") + coupon_discount = item.get("couponDiscount", item.get("couponSavings")) + loyalty_discount = item.get("mperksDiscount", item.get("loyaltyDiscount")) + + upc = item.get("upc", item.get("UPC")) + if upc: + upc = str(upc).strip().lstrip("0") or None + + category = item.get("category", item.get("departmentDescription")) + + return { + "product_name_raw": description.strip(), + "upc": upc, + "quantity": quantity, + "unit_price": unit_price, + "extended_price": extended_price, + "regular_price": _to_decimal(regular_price) if regular_price is not None else None, + "sale_price": _to_decimal(sale_price) if sale_price is not None else None, + "coupon_discount": (_to_decimal(coupon_discount) if coupon_discount is not None else None), + "loyalty_discount": ( + _to_decimal(loyalty_discount) if loyalty_discount is not None else None + ), + "category_raw": category.strip() if category else None, + } + + +def parse_meijer_receipt(raw: RawReceipt) -> dict: + """Parse a RawReceipt from Meijer into a PurchaseCreate-compatible dict. + + Returns a dict with keys matching PurchaseCreate schema fields. + The caller is responsible for setting store_id and store_location_id + from the store registry. 
+ """ + data = raw.raw_data + detail = data.get("detail", {}) + + # Parse items from the detail response + raw_items = detail.get("items", detail.get("lineItems", [])) + items = [] + for raw_item in raw_items: + # Skip voided items + if raw_item.get("voided") or raw_item.get("status") == "VOIDED": + logger.debug("Skipping voided item: %s", raw_item.get("description")) + continue + items.append(_parse_item(raw_item)) + + # Parse totals + total = _to_decimal(detail.get("total", data.get("total", data.get("transactionTotal", 0)))) + subtotal = detail.get("subtotal", data.get("subtotal")) + tax = detail.get("tax", data.get("tax")) + savings = detail.get("totalSavings", data.get("savings", data.get("totalDiscount"))) + + return { + "receipt_id": raw.receipt_id, + "purchase_date": raw.purchase_date, + "total": total, + "subtotal": _to_decimal(subtotal) if subtotal is not None else None, + "tax": _to_decimal(tax) if tax is not None else None, + "savings_total": _to_decimal(savings) if savings is not None else None, + "source_url": raw.source_url, + "raw_data": data, + "items": items, + } diff --git a/src/receiptwitness/parsers/target.py b/src/receiptwitness/parsers/target.py new file mode 100644 index 0000000..25b4204 --- /dev/null +++ b/src/receiptwitness/parsers/target.py @@ -0,0 +1,191 @@ +"""Target Circle receipt parser. + +Transforms raw Target in-store receipt JSON into the common PurchaseCreate schema. +Target receipt data includes Circle pricing, BOGO deals, and Circle rewards +discounts that need special handling. 
+ +Target receipt detail structure (reverse-engineered from target.com SPA): + +{ + "orderId": "TGT-2026-0315-7890", + "items": [ + { + "description": "GOOD & GATHER WHOLE MILK GAL", + "tcin": "14767459", + "upc": "0085239100123", + "quantity": 1, + "unitPrice": 3.89, + "totalPrice": 3.89, + "regularPrice": 4.19, + "circlePrice": 3.89, + "couponDiscount": 0.0, + "circleRewardsDiscount": 0.30, + "promoDescription": "Circle offer: Save 30c", + "department": "GROCERY" + } + ], + "subtotal": 78.32, + "tax": 4.89, + "total": 83.21, + "totalSavings": 11.45 +} +""" + +import logging +from decimal import Decimal, InvalidOperation + +from receiptwitness.scrapers.base import RawReceipt + +logger = logging.getLogger(__name__) + + +def _to_decimal(value, default: str = "0") -> Decimal: + """Safely convert a value to Decimal.""" + if value is None: + return Decimal(default) + try: + return Decimal(str(value)) + except (InvalidOperation, ValueError, TypeError): + return Decimal(default) + + +def _parse_item(item: dict) -> dict: + """Parse a single line item from a Target receipt. + + Target items may include fields like: + - description / itemDescription / productName + - tcin (Target internal product ID) / upc / dpci + - quantity / qty + - unitPrice / price + - totalPrice / extendedPrice / lineTotal + - regularPrice / originalPrice + - circlePrice / salePrice / promoPrice + - couponDiscount / couponSavings + - circleRewardsDiscount / circleDiscount / loyaltyDiscount + - promoDescription / offerDescription (e.g. 
"BOGO 50% off", "Circle offer") + - department / category + """ + description = ( + item.get("description") + or item.get("itemDescription") + or item.get("productName") + or item.get("name") + or "UNKNOWN ITEM" + ) + + quantity = _to_decimal(item.get("quantity", item.get("qty", item.get("quantitySold", 1))), "1") + unit_price = _to_decimal(item.get("unitPrice", item.get("price", item.get("basePrice", 0)))) + extended_price = _to_decimal( + item.get("totalPrice", item.get("extendedPrice", item.get("lineTotal"))) + ) + + # Compute extended_price if not provided + if extended_price == Decimal("0") and unit_price != Decimal("0"): + extended_price = unit_price * quantity + + regular_price = item.get("regularPrice", item.get("originalPrice")) + # Target Circle pricing — circlePrice takes precedence over generic salePrice + sale_price = item.get("circlePrice", item.get("salePrice", item.get("promoPrice"))) + coupon_discount = item.get( + "couponDiscount", item.get("couponSavings", item.get("couponAmount")) + ) + # Circle rewards / loyalty discount + loyalty_discount = item.get( + "circleRewardsDiscount", + item.get("circleDiscount", item.get("loyaltyDiscount")), + ) + + # UPC handling — Target may use tcin, upc, or dpci + upc = item.get("upc", item.get("UPC")) + if upc: + upc = str(upc).strip().lstrip("0") or None + + # Target also has TCIN (Target.com Item Number) and DPCI (Department/Class/Item) + tcin = item.get("tcin", item.get("TCIN")) + dpci = item.get("dpci", item.get("DPCI")) + + category = item.get("department", item.get("category")) + + # Capture promo/deal description for BOGO and Circle offers + promo_description = item.get("promoDescription", item.get("offerDescription")) + + # Weight info for produce/deli items + weight = item.get("weight", item.get("netWeight")) + extra: dict = {} + if weight is not None: + extra["weight"] = str(weight) + weight_uom = item.get("weightUom", item.get("unitOfMeasure")) + if weight_uom: + extra["weight_uom"] = weight_uom + if 
tcin: + extra["tcin"] = str(tcin) + if dpci: + extra["dpci"] = str(dpci) + if promo_description: + extra["promo_description"] = promo_description + + result: dict = { + "product_name_raw": description.strip(), + "upc": upc, + "quantity": quantity, + "unit_price": unit_price, + "extended_price": extended_price, + "regular_price": _to_decimal(regular_price) if regular_price is not None else None, + "sale_price": _to_decimal(sale_price) if sale_price is not None else None, + "coupon_discount": (_to_decimal(coupon_discount) if coupon_discount is not None else None), + "loyalty_discount": ( + _to_decimal(loyalty_discount) if loyalty_discount is not None else None + ), + "category_raw": category.strip() if category else None, + } + + return result + + +def parse_target_receipt(raw: RawReceipt) -> dict: + """Parse a RawReceipt from Target into a PurchaseCreate-compatible dict.""" + data = raw.raw_data + detail = data.get("detail", {}) + + # Parse items — Target uses "items" or "lineItems" + raw_items = detail.get("items", detail.get("lineItems", [])) + items = [] + for raw_item in raw_items: + # Skip voided / returned items + if raw_item.get("voided") or raw_item.get("status") in ( + "VOIDED", + "RETURNED", + "CANCELLED", + ): + logger.debug("Skipping voided/returned item: %s", raw_item.get("description")) + continue + if raw_item.get("returnFlag") or raw_item.get("isReturn"): + logger.debug("Skipping returned item: %s", raw_item.get("description")) + continue + items.append(_parse_item(raw_item)) + + # Parse totals + total = _to_decimal( + detail.get( + "total", + data.get("total", data.get("orderTotal", data.get("grandTotal", 0))), + ) + ) + subtotal = detail.get("subtotal", data.get("subtotal", data.get("subTotal"))) + tax = detail.get("tax", data.get("tax", data.get("salesTax"))) + savings = detail.get( + "totalSavings", + data.get("savings", data.get("totalDiscount", data.get("circleSavings"))), + ) + + return { + "receipt_id": raw.receipt_id, + "purchase_date": 
raw.purchase_date, + "total": total, + "subtotal": _to_decimal(subtotal) if subtotal is not None else None, + "tax": _to_decimal(tax) if tax is not None else None, + "savings_total": _to_decimal(savings) if savings is not None else None, + "source_url": raw.source_url, + "raw_data": data, + "items": items, + } diff --git a/src/receiptwitness/pipeline/__init__.py b/src/receiptwitness/pipeline/__init__.py new file mode 100644 index 0000000..e590387 --- /dev/null +++ b/src/receiptwitness/pipeline/__init__.py @@ -0,0 +1,30 @@ +"""Receipt & product matching pipeline — receipt normalization and product dedup.""" + +from receiptwitness.pipeline.matching import ( + ConfidenceLevel, + ProductMatcher, + match_purchase_item, +) +from receiptwitness.pipeline.normalization import ( + MatchMethod, + MatchResult, + clean_name, + extract_size_info, + jaccard_similarity, + normalize_product, +) +from receiptwitness.pipeline.receipt import normalize_receipt, parse_meijer_item + +__all__ = [ + "ConfidenceLevel", + "MatchMethod", + "MatchResult", + "ProductMatcher", + "clean_name", + "extract_size_info", + "jaccard_similarity", + "match_purchase_item", + "normalize_product", + "normalize_receipt", + "parse_meijer_item", +] diff --git a/src/receiptwitness/pipeline/matching.py b/src/receiptwitness/pipeline/matching.py new file mode 100644 index 0000000..7e71039 --- /dev/null +++ b/src/receiptwitness/pipeline/matching.py @@ -0,0 +1,136 @@ +"""Product matching & dedup — UPC primary, fuzzy name fallback, confidence scoring. + +Wraps the Phase 1 normalization module with confidence-level classification +and batch matching for purchase ingestion. 
# Re-export for convenience
ConfidenceLevel = MatchConfidence


@dataclass(frozen=True)
class MatchOutcome:
    """Outcome of matching one purchase item against the normalized products."""

    item_index: int  # position of the item in the list given to match_items
    match: MatchResult | None  # None when normalize_product found nothing
    confidence_level: MatchConfidence
    created_new: bool = False  # True when a product was created for the item


def classify_confidence(score: float, method: MatchMethod) -> MatchConfidence:
    """Map a match score and method to a high/medium/low confidence level.

    UPC matches are always HIGH. Name matches are HIGH from 0.8, MEDIUM from
    0.5 and LOW below that.
    """
    if method == MatchMethod.UPC or score >= 0.8:
        return MatchConfidence.HIGH
    return MatchConfidence.MEDIUM if score >= 0.5 else MatchConfidence.LOW


def _create_product_from_item(
    session: Session,
    item: PurchaseItemCreate,
) -> NormalizedProduct:
    """Add and flush a new NormalizedProduct built from an unmatched item."""
    parsed_size = extract_size_info(item.product_name_raw)
    size, size_unit = parsed_size if parsed_size else (None, None)
    known_upcs = [item.upc] if item.upc else []

    new_product = NormalizedProduct(
        id=uuid.uuid4(),
        canonical_name=item.product_name_raw,
        size=size,
        size_unit=size_unit,
        upc_variants=known_upcs,
    )
    session.add(new_product)
    session.flush()
    return new_product


class ProductMatcher:
    """Batch product matcher for purchase ingestion.

    Usage:
        matcher = ProductMatcher(session)
        outcomes = matcher.match_items(items)
    """

    def __init__(
        self,
        session: Session,
        name_threshold: float = 0.4,
        auto_create: bool = True,
    ):
        self.session = session
        self.name_threshold = name_threshold
        self.auto_create = auto_create

    def match_single(
        self,
        item: PurchaseItemCreate,
    ) -> tuple[NormalizedProduct | None, MatchResult | None, MatchConfidence]:
        """Match one purchase item to a normalized product.

        Returns ``(product, match_result, confidence_level)``. When nothing
        matches, ``match_result`` is ``None`` and the confidence is LOW; the
        product is then either newly created (``auto_create`` on) or ``None``.
        """
        found = normalize_product(
            self.session,
            item.product_name_raw,
            upc=item.upc,
            name_threshold=self.name_threshold,
        )

        if not found:
            created = (
                _create_product_from_item(self.session, item) if self.auto_create else None
            )
            return created, None, MatchConfidence.LOW

        level = classify_confidence(found.confidence, found.method)
        return found.product, found, level

    def match_items(self, items: list[PurchaseItemCreate]) -> list[MatchOutcome]:
        """Match a batch of purchase items. Returns outcomes in order."""
        outcomes: list[MatchOutcome] = []
        for position, purchase_item in enumerate(items):
            matched_product, match_result, level = self.match_single(purchase_item)
            # A product without a match result can only come from auto-creation.
            was_created = match_result is None and matched_product is not None
            outcomes.append(
                MatchOutcome(
                    item_index=position,
                    match=match_result,
                    confidence_level=level,
                    created_new=was_created,
                )
            )
        return outcomes


def match_purchase_item(
    session: Session,
    item: PurchaseItemCreate,
    name_threshold: float = 0.4,
    auto_create: bool = True,
) -> tuple[NormalizedProduct | None, MatchConfidence]:
    """Convenience function: match a single item, return (product, confidence)."""
    single_use_matcher = ProductMatcher(
        session,
        name_threshold=name_threshold,
        auto_create=auto_create,
    )
    product, _unused_result, confidence = single_use_matcher.match_single(item)
    return product, confidence
Fuzzy name matching via token-based Jaccard similarity (lower confidence) +""" + +import re +from dataclasses import dataclass +from enum import StrEnum + +from cartsnitch_common.models.product import NormalizedProduct +from sqlalchemy import select +from sqlalchemy.orm import Session + + +class MatchMethod(StrEnum): + """How a product match was determined.""" + + UPC = "upc" + NAME = "name" + + +@dataclass(frozen=True) +class MatchResult: + """Result of a product normalization attempt.""" + + product: NormalizedProduct + confidence: float + method: MatchMethod + + +# Noise words stripped during name cleaning +_NOISE_WORDS = frozenset( + { + "the", + "a", + "an", + "and", + "or", + "of", + "with", + "in", + "for", + "to", + "brand", + "original", + "classic", + "new", + "improved", + } +) + +# Regex for extracting size info (e.g., "16 oz", "1.5 lb", "12 ct") +_SIZE_PATTERN = re.compile( + r"(\d+(?:\.\d+)?)\s*(oz|fl\s*oz|lb|lbs|g|kg|ml|l|ct|pk|count|pack)\b", + re.IGNORECASE, +) + + +def clean_name(name: str) -> str: + """Normalize a product name for comparison. 
+ + - Lowercase + - Remove size info (e.g., "16 oz") + - Strip noise words + - Collapse whitespace + """ + cleaned = name.lower() + cleaned = _SIZE_PATTERN.sub("", cleaned) + cleaned = re.sub(r"[^\w\s]", " ", cleaned) + tokens = cleaned.split() + tokens = [t for t in tokens if t not in _NOISE_WORDS] + return " ".join(tokens) + + +def extract_size_info(name: str) -> tuple[str, str] | None: + """Extract (size, unit) from a product name, if present.""" + match = _SIZE_PATTERN.search(name) + if match: + return match.group(1), match.group(2).lower().replace(" ", "_") + return None + + +def jaccard_similarity(a: str, b: str) -> float: + """Token-based Jaccard similarity between two cleaned names.""" + tokens_a = set(a.split()) + tokens_b = set(b.split()) + if not tokens_a or not tokens_b: + return 0.0 + intersection = tokens_a & tokens_b + union = tokens_a | tokens_b + return len(intersection) / len(union) + + +def match_by_upc(session: Session, upc: str) -> MatchResult | None: + """Find a normalized product by exact UPC match. + + Loads products with upc_variants and checks membership in Python + for cross-database compatibility (works on both PostgreSQL and SQLite). + """ + # TODO: Use PostgreSQL JSON containment query (@>) for production. + # Current approach loads all products into memory — acceptable for tests + # and small datasets, but will not scale. + stmt = select(NormalizedProduct).where(NormalizedProduct.upc_variants.is_not(None)) + products = session.execute(stmt).scalars().all() + for product in products: + if product.upc_variants and upc in product.upc_variants: + return MatchResult(product=product, confidence=1.0, method=MatchMethod.UPC) + return None + + +def match_by_name( + session: Session, + name: str, + threshold: float = 0.5, +) -> MatchResult | None: + """Find the best normalized product by fuzzy name matching. + + Loads all normalized products and computes Jaccard similarity. + Returns the best match above the threshold, or None. 
+ """ + # TODO: Use pg_trgm similarity index for production. + # Current approach loads all products into memory — acceptable for tests + # and small datasets, but will not scale. + cleaned = clean_name(name) + stmt = select(NormalizedProduct) + products = session.execute(stmt).scalars().all() + + best_match: NormalizedProduct | None = None + best_score = 0.0 + + for product in products: + score = jaccard_similarity(cleaned, clean_name(product.canonical_name)) + if score > best_score and score >= threshold: + best_score = score + best_match = product + + if best_match: + return MatchResult(product=best_match, confidence=best_score, method=MatchMethod.NAME) + return None + + +def normalize_product( + session: Session, + name: str, + upc: str | None = None, + name_threshold: float = 0.5, +) -> MatchResult | None: + """Full normalization pipeline: UPC first, then fuzzy name fallback.""" + if upc: + result = match_by_upc(session, upc) + if result: + return result + return match_by_name(session, name, threshold=name_threshold) diff --git a/src/receiptwitness/pipeline/receipt.py b/src/receiptwitness/pipeline/receipt.py new file mode 100644 index 0000000..7d3e863 --- /dev/null +++ b/src/receiptwitness/pipeline/receipt.py @@ -0,0 +1,144 @@ +"""Receipt normalization — parse raw Meijer scraper output into purchase records. + +Maps raw receipt fields, cleans product names, extracts quantities/units. 
def _clean_product_name(raw: str) -> str:
    """Clean raw product name from scraper output."""
    cleaned = raw.strip()
    # Remove leading/trailing non-alphanumeric chars
    cleaned = re.sub(r"^\W+|\W+$", "", cleaned)
    # Collapse internal whitespace
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned


def _safe_decimal(
    value: str | float | int | Decimal | None,
    default: Decimal = Decimal("0"),
) -> Decimal:
    """Convert *value* to ``Decimal``; return *default* for ``None`` or unparsable input."""
    if value is None:
        return default
    try:
        return Decimal(str(value))
    except (InvalidOperation, ValueError):
        return default


def _first_not_none(mapping: dict, *keys: str):
    """Return the first value under *keys* that is not ``None``, else ``None``.

    Used for numeric fields, where ``a or b`` would wrongly discard a
    legitimate ``0``.
    """
    for key in keys:
        value = mapping.get(key)
        if value is not None:
            return value
    return None


def parse_meijer_item(raw_item: dict) -> PurchaseItemCreate:
    """Parse a single Meijer scraper line item into a PurchaseItemCreate.

    Expected raw_item keys (from Meijer scraper):
    - description / name: product name
    - upc / upcCode: UPC barcode
    - quantity / qty: number of units
    - unitPrice / price: per-unit price
    - extendedPrice / totalPrice: line total
    - regularPrice: shelf price before discounts
    - salePrice: sale price if applicable
    - couponAmount / couponDiscount: coupon savings
    - loyaltyAmount / loyaltyDiscount: loyalty savings
    - category / department: raw category

    Numeric fields keep an explicit ``0``; only a missing or ``None`` value
    falls through to the alternate key or the default.
    """
    name = raw_item.get("description") or raw_item.get("name") or ""
    cleaned_name = _clean_product_name(str(name))

    upc = raw_item.get("upc") or raw_item.get("upcCode")
    if upc:
        stripped = str(upc).strip()
        # Drop leading zeros, but keep an all-zero code as-is; blank becomes None.
        upc = stripped.lstrip("0") or stripped or None

    qty = _safe_decimal(
        _first_not_none(raw_item, "quantity", "qty"),
        default=Decimal("1"),
    )

    unit_price = _safe_decimal(_first_not_none(raw_item, "unitPrice", "price"))
    extended = _safe_decimal(_first_not_none(raw_item, "extendedPrice", "totalPrice"))
    if extended == Decimal("0") and unit_price > 0:
        extended = unit_price * qty

    regular = raw_item.get("regularPrice")
    sale = raw_item.get("salePrice")
    coupon = _first_not_none(raw_item, "couponAmount", "couponDiscount")
    loyalty = _first_not_none(raw_item, "loyaltyAmount", "loyaltyDiscount")
    category = raw_item.get("category") or raw_item.get("department")

    return PurchaseItemCreate(
        product_name_raw=cleaned_name,
        upc=upc,
        quantity=qty,
        unit_price=unit_price,
        extended_price=extended,
        regular_price=_safe_decimal(regular) if regular is not None else None,
        sale_price=_safe_decimal(sale) if sale is not None else None,
        coupon_discount=_safe_decimal(coupon) if coupon is not None else None,
        loyalty_discount=_safe_decimal(loyalty) if loyalty is not None else None,
        category_raw=str(category).strip() if category else None,
    )


def normalize_receipt(
    raw_receipt: dict,
    user_id: str,
    store_id: str,
) -> PurchaseCreate:
    """Parse a complete Meijer raw receipt into a PurchaseCreate.

    Expected raw_receipt keys:
    - receiptId / receipt_id / id: unique receipt identifier
      (a random UUID is generated when none is present)
    - date / purchaseDate / purchase_date: purchase date; a string is parsed
      from its first 10 characters as ISO ``YYYY-MM-DD``, a ``date`` or
      ``datetime`` is reduced to its calendar date, anything else falls back
      to today's date
    - total / totalAmount: receipt total
    - subtotal: pre-tax subtotal
    - tax / taxAmount: tax amount
    - savings / totalSavings: total discount savings
    - items: list of raw line item dicts

    Raises:
        ValueError: if a string date does not start with a valid ISO date, or
            if ``user_id`` / ``store_id`` is a string that is not a valid UUID.
    """
    import uuid

    receipt_id = str(
        raw_receipt.get("receiptId")
        or raw_receipt.get("receipt_id")
        or raw_receipt.get("id")
        or uuid.uuid4()
    )

    raw_date = (
        raw_receipt.get("date")
        or raw_receipt.get("purchaseDate")
        or raw_receipt.get("purchase_date")
    )
    if isinstance(raw_date, str):
        purchase_date = date.fromisoformat(raw_date[:10])
    elif isinstance(raw_date, date):
        # datetime is a subclass of date: rebuild a plain date so the time
        # component never reaches the schema.
        purchase_date = date(raw_date.year, raw_date.month, raw_date.day)
    else:
        purchase_date = date.today()

    total = _safe_decimal(_first_not_none(raw_receipt, "total", "totalAmount"))
    subtotal = raw_receipt.get("subtotal")
    tax = _first_not_none(raw_receipt, "tax", "taxAmount")
    savings = _first_not_none(raw_receipt, "savings", "totalSavings")

    raw_items = raw_receipt.get("items") or []
    items = [parse_meijer_item(item) for item in raw_items]

    return PurchaseCreate(
        user_id=uuid.UUID(user_id) if isinstance(user_id, str) else user_id,
        store_id=uuid.UUID(store_id) if isinstance(store_id, str) else store_id,
        receipt_id=receipt_id,
        purchase_date=purchase_date,
        total=total,
        subtotal=_safe_decimal(subtotal) if subtotal is not None else None,
        tax=_safe_decimal(tax) if tax is not None else None,
        savings_total=_safe_decimal(savings) if savings is not None else None,
        raw_data=raw_receipt,
        items=items,
    )
or raw_receipt.get("totalSavings") + + raw_items = raw_receipt.get("items") or [] + items = [parse_meijer_item(item) for item in raw_items] + + return PurchaseCreate( + user_id=uuid.UUID(user_id) if isinstance(user_id, str) else user_id, + store_id=uuid.UUID(store_id) if isinstance(store_id, str) else store_id, + receipt_id=receipt_id, + purchase_date=purchase_date, + total=total, + subtotal=_safe_decimal(subtotal) if subtotal is not None else None, + tax=_safe_decimal(tax) if tax is not None else None, + savings_total=_safe_decimal(savings) if savings is not None else None, + raw_data=raw_receipt, + items=items, + ) diff --git a/src/receiptwitness/scrapers/__init__.py b/src/receiptwitness/scrapers/__init__.py new file mode 100644 index 0000000..cfc8d9e --- /dev/null +++ b/src/receiptwitness/scrapers/__init__.py @@ -0,0 +1 @@ +"""Retailer scrapers.""" diff --git a/src/receiptwitness/scrapers/base.py b/src/receiptwitness/scrapers/base.py new file mode 100644 index 0000000..fd5fdc3 --- /dev/null +++ b/src/receiptwitness/scrapers/base.py @@ -0,0 +1,72 @@ +"""Abstract base scraper interface for all retailer scrapers.""" + +import asyncio +import random +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from datetime import datetime + +from receiptwitness.config import settings + + +@dataclass +class SessionData: + """Holds session cookies and metadata for a retailer login.""" + + cookies: list[dict] + user_agent: str + created_at: datetime + expires_at: datetime | None = None + extra: dict = field(default_factory=dict) + + +@dataclass +class RawReceipt: + """Raw receipt data before parsing.""" + + receipt_id: str + purchase_date: str + store_number: str | None = None + raw_data: dict = field(default_factory=dict) + source_url: str | None = None + + +class BaseScraper(ABC): + """All retailer scrapers implement this interface. 
+ + Provides common functionality: human-like delays, rate limiting guards, + and the abstract methods each retailer scraper must implement. + """ + + @abstractmethod + async def login(self, username: str, password: str) -> SessionData: + """Authenticate with the retailer portal and return session data.""" + ... + + @abstractmethod + async def check_session(self, session: SessionData) -> bool: + """Verify if an existing session is still valid.""" + ... + + @abstractmethod + async def scrape_receipts( + self, session: SessionData, since: datetime | None = None + ) -> list[RawReceipt]: + """Scrape receipt data from the retailer portal.""" + ... + + @abstractmethod + def parse_receipt(self, raw: RawReceipt) -> dict: + """Parse a raw receipt into structured data. + + Returns a dict with keys matching PurchaseCreate schema fields, + including an 'items' list matching PurchaseItemCreate fields. + """ + ... + + async def human_delay(self, min_ms: int | None = None, max_ms: int | None = None) -> None: + """Sleep for a randomized human-like interval.""" + lo = min_ms or settings.min_request_delay_ms + hi = max_ms or settings.max_request_delay_ms + delay = random.randint(lo, hi) / 1000.0 + await asyncio.sleep(delay) diff --git a/src/receiptwitness/scrapers/kroger.py b/src/receiptwitness/scrapers/kroger.py new file mode 100644 index 0000000..a7993af --- /dev/null +++ b/src/receiptwitness/scrapers/kroger.py @@ -0,0 +1,344 @@ +"""Kroger loyalty portal scraper using Playwright. + +Kroger uses Akamai Bot Manager for aggressive headless browser detection. +This scraper uses enhanced stealth measures including playwright-stealth, +realistic fingerprinting, and human-like interaction pacing. 
+""" + +import logging +from datetime import UTC, datetime, timedelta +from typing import cast + +from playwright.async_api import BrowserContext, Page, Playwright, async_playwright + +from receiptwitness.config import settings +from receiptwitness.scrapers.base import BaseScraper, RawReceipt, SessionData + +logger = logging.getLogger(__name__) + +# Kroger endpoints +KROGER_BASE = "https://www.kroger.com" +KROGER_LOGIN_PAGE = f"{KROGER_BASE}/signin" +KROGER_PURCHASE_HISTORY = f"{KROGER_BASE}/mypurchases" +KROGER_RECEIPT_API = f"{KROGER_BASE}/atlas/v1/purchase-history/api" +KROGER_RECEIPT_DETAIL_API = f"{KROGER_BASE}/atlas/v1/receipt/api" +KROGER_ACCOUNT_PAGE = f"{KROGER_BASE}/account/dashboard" + +# Realistic browser fingerprint — Chrome on Windows (matches Kroger's typical audience) +DEFAULT_USER_AGENT = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/131.0.0.0 Safari/537.36" +) +DEFAULT_VIEWPORT = {"width": 1920, "height": 1080} +DEFAULT_LOCALE = "en-US" +DEFAULT_TIMEZONE = "America/New_York" + + +class KrogerScraper(BaseScraper): + """Scraper for Kroger loyalty purchase history. + + Kroger uses Akamai Bot Manager which aggressively detects headless + browsers. 
This scraper employs enhanced stealth measures: + - Masks webdriver/automation signals + - Sets realistic browser fingerprint + - Uses human-like interaction pacing + - Preserves browser context across sessions + """ + + async def _create_stealth_context( + self, playwright_instance: Playwright, cookies: list[dict] | None = None + ) -> BrowserContext: + """Create a browser context with enhanced stealth for Akamai evasion.""" + browser = await playwright_instance.chromium.launch( + headless=settings.headless, + args=[ + "--disable-blink-features=AutomationControlled", + "--no-sandbox", + "--disable-dev-shm-usage", + "--disable-infobars", + "--window-size=1920,1080", + ], + ) + context = await browser.new_context( + user_agent=DEFAULT_USER_AGENT, + viewport=DEFAULT_VIEWPORT, # type: ignore[arg-type] + locale=DEFAULT_LOCALE, + timezone_id=DEFAULT_TIMEZONE, + java_script_enabled=True, + bypass_csp=False, + color_scheme="light", + has_touch=False, + ) + + # Enhanced stealth script targeting Akamai Bot Manager detection vectors + await context.add_init_script( + """ + // Mask webdriver flag + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + + // Chrome runtime object + window.chrome = { + runtime: {}, + loadTimes: function() {}, + csi: function() {}, + app: { isInstalled: false } + }; + + // Realistic plugin array + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5] + }); + + // Languages + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'] + }); + + // Platform + Object.defineProperty(navigator, 'platform', { + get: () => 'Win32' + }); + + // Hardware concurrency + Object.defineProperty(navigator, 'hardwareConcurrency', { + get: () => 8 + }); + + // Device memory + Object.defineProperty(navigator, 'deviceMemory', { + get: () => 8 + }); + + // Permissions query override (Akamai checks this) + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = 
(parameters) => + parameters.name === 'notifications' + ? Promise.resolve({ state: Notification.permission }) + : originalQuery(parameters); + + // WebGL vendor/renderer (avoid "Google Inc." / "ANGLE" tells) + const getParameter = WebGLRenderingContext.prototype.getParameter; + WebGLRenderingContext.prototype.getParameter = function(parameter) { + if (parameter === 37445) return 'Intel Inc.'; + if (parameter === 37446) return 'Intel Iris OpenGL Engine'; + return getParameter.call(this, parameter); + }; + """ + ) + + if cookies: + await context.add_cookies(cookies) # type: ignore[arg-type] + + return cast(BrowserContext, context) + + async def login(self, username: str, password: str) -> SessionData: + """Log in to Kroger and capture session cookies.""" + async with async_playwright() as p: + context = await self._create_stealth_context(p) + page = await context.new_page() + try: + return await self._perform_login(page, context, username, password) + finally: + if context.browser: + await context.browser.close() + + async def _perform_login( + self, page: Page, context: BrowserContext, username: str, password: str + ) -> SessionData: + """Execute the Kroger login flow.""" + logger.info("Navigating to Kroger sign-in page") + await page.goto(KROGER_LOGIN_PAGE, wait_until="networkidle") + await self.human_delay(2000, 4000) + + # Kroger login form — email/username field + email_input = page.locator( + 'input[id="SignIn-emailInput"], ' + 'input[name="email"], ' + 'input[type="email"], ' + 'input[data-testid="SignIn-emailInput"]' + ) + await email_input.wait_for(state="visible", timeout=settings.browser_timeout_ms) + await email_input.click() + await self.human_delay(300, 700) + await email_input.fill(username) + await self.human_delay(800, 1500) + + # Password field + password_input = page.locator( + 'input[id="SignIn-passwordInput"], ' + 'input[name="password"], ' + 'input[type="password"], ' + 'input[data-testid="SignIn-passwordInput"]' + ) + await 
async def check_session(self, session: SessionData) -> bool:
    """Report whether a stored Kroger session can still be used.

    A session whose ``expires_at`` lies in the past is rejected without
    opening a browser. Otherwise the account page is loaded with the stored
    cookies; the session counts as valid when the final URL does not contain
    "signin" and the navigation response is OK. Any exception raised while
    loading the page is logged and reported as an invalid session.
    """
    expiry = session.expires_at
    if expiry and datetime.now(UTC) > expiry:
        logger.info("Kroger session expired based on timestamp")
        return False

    async with async_playwright() as playwright:
        browser_context = await self._create_stealth_context(
            playwright, cookies=session.cookies
        )
        tab = await browser_context.new_page()
        try:
            nav_response = await tab.goto(KROGER_ACCOUNT_PAGE, wait_until="networkidle")
            # A redirect back to the sign-in page means the cookies were rejected.
            if "signin" in tab.url.lower() or nav_response is None:
                still_valid = False
            else:
                still_valid = nav_response.ok
            logger.info("Kroger session check: valid=%s (url=%s)", still_valid, tab.url)
            return still_valid
        except Exception:
            logger.exception("Kroger session check failed")
            return False
        finally:
            browser = browser_context.browser
            if browser:
                await browser.close()
async def _fetch_receipts(self, page: Page, since: datetime | None) -> list[RawReceipt]:
    """Fetch receipt list and details from Kroger purchase history.

    Args:
        page: Page created from a context that already carries the session
            cookies; its ``request`` API is used for the JSON endpoints.
        since: When given, orders whose purchase timestamp parses to an
            earlier instant are skipped. Orders with a missing or
            unparseable timestamp are kept.

    Returns:
        One RawReceipt per retained order, or an empty list when the listing
        request fails or the payload does not have the expected shape.
    """

    def _purchased_before_cutoff(raw_date: object) -> bool:
        """Return True only when ``raw_date`` parses and is earlier than ``since``."""
        if since is None or not isinstance(raw_date, str) or not raw_date:
            return False
        try:
            purchased_at = datetime.fromisoformat(raw_date.replace("Z", "+00:00"))
        except ValueError:
            logger.debug("Unparseable Kroger purchase date %r; keeping order", raw_date)
            return False
        cutoff = since
        # Comparing a naive with an aware datetime raises TypeError, which
        # previously disabled the filter silently.
        # NOTE(review): naive values are assumed to be UTC — confirm against real payloads.
        if purchased_at.tzinfo is None:
            purchased_at = purchased_at.replace(tzinfo=UTC)
        if cutoff.tzinfo is None:
            cutoff = cutoff.replace(tzinfo=UTC)
        return purchased_at < cutoff

    # Navigate to purchase history to establish context
    await page.goto(KROGER_PURCHASE_HISTORY, wait_until="networkidle")
    await self.human_delay(1500, 3000)

    # Kroger purchase history API endpoint
    api_response = await page.request.get(KROGER_RECEIPT_API)
    if not api_response.ok:
        logger.warning(
            "Kroger purchase history request failed: %d %s",
            api_response.status,
            api_response.status_text,
        )
        return []

    response = await api_response.json()
    if not isinstance(response, dict):
        logger.warning("Unexpected purchase history response type: %s", type(response))
        return []

    # Handle Kroger's response structure
    orders = response.get("orders", response.get("purchases", []))
    if not isinstance(orders, list):
        logger.warning("No orders found in Kroger purchase history response")
        return []

    logger.info("Found %d orders in Kroger purchase history", len(orders))

    receipts: list[RawReceipt] = []
    for order in orders:
        if not isinstance(order, dict):
            # One malformed entry must not abort the whole scrape.
            logger.warning("Skipping non-object Kroger order entry: %s", type(order))
            continue

        raw_id = order.get("orderId") or order.get("receiptId") or order.get("id") or ""
        order_id = str(raw_id)
        purchase_date = order.get(
            "purchaseDate", order.get("transactionDate", order.get("date", ""))
        )

        # Filter by date if 'since' is provided
        if _purchased_before_cutoff(purchase_date):
            continue

        if not order_id:
            continue

        await self.human_delay(1000, 2500)

        # Fetch receipt detail
        detail = await self._fetch_receipt_detail(page, order_id)

        raw_store = (
            order.get("storeNumber")
            or order.get("divisionNumber")
            or order.get("storeId")
            or ""
        )

        receipts.append(
            RawReceipt(
                receipt_id=order_id,
                purchase_date=purchase_date,
                store_number=str(raw_store),
                raw_data={**order, "detail": detail},
                source_url=f"{KROGER_RECEIPT_DETAIL_API}?orderId={order_id}",
            )
        )

    logger.info("Scraped %d receipts from Kroger", len(receipts))
    return receipts
Parse receipt JSON into structured PurchaseCreate records + +Key endpoints (reverse-engineered from mPerks SPA): +- Login: POST https://www.meijer.com/bin/meijer/account/login +- Receipts: GET https://www.meijer.com/bin/meijer/profile/purchasehistory +- Receipt detail: GET https://www.meijer.com/bin/meijer/profile/receipt?receiptId=... +""" + +import logging +from datetime import UTC, datetime, timedelta +from typing import cast + +from playwright.async_api import BrowserContext, Page, Playwright, async_playwright + +from receiptwitness.config import settings +from receiptwitness.scrapers.base import BaseScraper, RawReceipt, SessionData + +logger = logging.getLogger(__name__) + +# Meijer mPerks URLs +MEIJER_BASE = "https://www.meijer.com" +MEIJER_LOGIN_PAGE = f"{MEIJER_BASE}/shopping/login.html" +MEIJER_LOGIN_API = f"{MEIJER_BASE}/bin/meijer/account/login" +MEIJER_PURCHASE_HISTORY = f"{MEIJER_BASE}/bin/meijer/profile/purchasehistory" +MEIJER_RECEIPT_DETAIL = f"{MEIJER_BASE}/bin/meijer/profile/receipt" +MEIJER_MPERKS_HOME = f"{MEIJER_BASE}/mperks.html" + +# Realistic browser fingerprint +DEFAULT_USER_AGENT = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/131.0.0.0 Safari/537.36" +) +DEFAULT_VIEWPORT = {"width": 1920, "height": 1080} +DEFAULT_LOCALE = "en-US" +DEFAULT_TIMEZONE = "America/Detroit" # Meijer HQ is in Grand Rapids, MI + + +class MeijerScraper(BaseScraper): + """Scraper for Meijer mPerks purchase history.""" + + async def _create_stealth_context( + self, playwright_instance: Playwright, cookies: list[dict] | None = None + ) -> BrowserContext: + """Create a browser context with stealth settings.""" + browser = await playwright_instance.chromium.launch( + headless=settings.headless, + args=[ + "--disable-blink-features=AutomationControlled", + "--no-sandbox", + ], + ) + context = await browser.new_context( + user_agent=DEFAULT_USER_AGENT, + viewport=DEFAULT_VIEWPORT, # type: ignore[arg-type] + 
locale=DEFAULT_LOCALE, + timezone_id=DEFAULT_TIMEZONE, + java_script_enabled=True, + bypass_csp=False, + ) + # Mask webdriver flag + await context.add_init_script( + """ + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + // Mask chrome automation indicators + window.chrome = { runtime: {} }; + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5] + }); + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'] + }); + """ + ) + if cookies: + await context.add_cookies(cookies) # type: ignore[arg-type] + return cast(BrowserContext, context) + + async def login(self, username: str, password: str) -> SessionData: + """Log in to Meijer mPerks and capture session cookies. + + The mPerks login flow: + 1. Navigate to login page + 2. Fill email and password fields + 3. Click sign-in button + 4. Wait for redirect to mPerks dashboard + 5. Extract session cookies + """ + async with async_playwright() as p: + context = await self._create_stealth_context(p) + page = await context.new_page() + try: + return await self._perform_login(page, context, username, password) + finally: + if context.browser: + await context.browser.close() + + async def _perform_login( + self, page: Page, context: BrowserContext, username: str, password: str + ) -> SessionData: + """Execute the login flow on the mPerks portal.""" + logger.info("Navigating to Meijer login page") + await page.goto(MEIJER_LOGIN_PAGE, wait_until="networkidle") + await self.human_delay(1500, 3000) + + # Fill email field + email_input = page.locator('input[type="email"], input[name="email"], #email') + await email_input.wait_for(state="visible", timeout=settings.browser_timeout_ms) + await email_input.click() + await self.human_delay(200, 500) + await email_input.fill(username) + await self.human_delay(500, 1000) + + # Fill password field + password_input = page.locator('input[type="password"], input[name="password"], #password') + await 
password_input.wait_for(state="visible", timeout=settings.browser_timeout_ms) + await password_input.click() + await self.human_delay(200, 500) + await password_input.fill(password) + await self.human_delay(500, 1500) + + # Click sign-in button + sign_in_btn = page.locator( + 'button[type="submit"], button:has-text("Sign In"), button:has-text("Log In")' + ) + await sign_in_btn.click() + + # Wait for navigation after login + await page.wait_for_url( + lambda url: "login" not in url.lower(), + timeout=settings.browser_timeout_ms, + ) + await self.human_delay(1000, 2000) + + # Capture cookies + raw_cookies = await context.cookies() + cookies = [dict(c) for c in raw_cookies] + now = datetime.now(UTC) + + logger.info("Meijer login successful, captured %d cookies", len(cookies)) + return SessionData( + cookies=cookies, + user_agent=DEFAULT_USER_AGENT, + created_at=now, + expires_at=now + timedelta(hours=4), + ) + + async def check_session(self, session: SessionData) -> bool: + """Check if the mPerks session is still valid. + + Makes a lightweight request to the mPerks home page and checks + if we get redirected to login (session expired) or not. 
async def _fetch_receipts(self, page: Page, since: datetime | None) -> list[RawReceipt]:
    """Fetch receipt list and detail via mPerks XHR endpoints.

    Uses Playwright's page.request API (APIRequestContext) instead of
    page.evaluate(fetch(...)) for better observability — requests show up
    in Playwright traces and can be intercepted by route handlers.

    Args:
        page: Page created from a context that already carries the session
            cookies.
        since: When given, transactions whose timestamp parses to an earlier
            instant are skipped. Transactions with a missing or unparseable
            timestamp are kept.

    Returns:
        One RawReceipt per retained transaction, or an empty list when the
        listing request fails or the payload does not have the expected shape.
    """

    def _purchased_before_cutoff(raw_date: object) -> bool:
        """Return True only when ``raw_date`` parses and is earlier than ``since``."""
        if since is None or not isinstance(raw_date, str) or not raw_date:
            return False
        try:
            purchased_at = datetime.fromisoformat(raw_date.replace("Z", "+00:00"))
        except ValueError:
            logger.debug("Unparseable Meijer purchase date %r; keeping transaction", raw_date)
            return False
        cutoff = since
        # Comparing a naive with an aware datetime raises TypeError, which
        # previously disabled the filter silently.
        # NOTE(review): naive values are assumed to be UTC — confirm against real payloads.
        if purchased_at.tzinfo is None:
            purchased_at = purchased_at.replace(tzinfo=UTC)
        if cutoff.tzinfo is None:
            cutoff = cutoff.replace(tzinfo=UTC)
        return purchased_at < cutoff

    # Navigate to mPerks to establish context (cookies need domain context)
    await page.goto(MEIJER_MPERKS_HOME, wait_until="networkidle")
    await self.human_delay(1000, 2000)

    # Fetch purchase history listing via page.request (APIRequestContext)
    api_response = await page.request.get(MEIJER_PURCHASE_HISTORY)
    if not api_response.ok:
        logger.warning(
            "Purchase history request failed: %d %s",
            api_response.status,
            api_response.status_text,
        )
        return []

    response = await api_response.json()
    if not isinstance(response, dict):
        logger.warning("Unexpected purchase history response type: %s", type(response))
        return []

    transactions = response.get("transactions", response.get("purchaseHistory", []))
    if not isinstance(transactions, list):
        logger.warning("No transactions found in purchase history response")
        return []

    logger.info("Found %d transactions in Meijer purchase history", len(transactions))

    receipts: list[RawReceipt] = []
    for txn in transactions:
        if not isinstance(txn, dict):
            # One malformed entry must not abort the whole scrape.
            logger.warning("Skipping non-object Meijer transaction entry: %s", type(txn))
            continue

        # ``or`` chain rather than nested .get defaults: a JSON null would
        # otherwise be stringified to the literal "None" and fetched as an id.
        receipt_id = str(txn.get("transactionId") or txn.get("receiptId") or "")
        purchase_date = txn.get("transactionDate", txn.get("purchaseDate", ""))

        # Filter by date if 'since' is provided
        if _purchased_before_cutoff(purchase_date):
            continue

        if not receipt_id:
            continue

        await self.human_delay(800, 2000)

        # Fetch receipt detail
        detail = await self._fetch_receipt_detail(page, receipt_id)

        receipts.append(
            RawReceipt(
                receipt_id=receipt_id,
                purchase_date=purchase_date,
                store_number=str(txn.get("storeNumber") or txn.get("storeId") or ""),
                raw_data={**txn, "detail": detail},
                source_url=f"{MEIJER_RECEIPT_DETAIL}?receiptId={receipt_id}",
            )
        )

    logger.info("Scraped %d receipts from Meijer", len(receipts))
    return receipts
receipt data for a single transaction. + + Uses Playwright's page.request API for traceability. + """ + try: + url = f"{MEIJER_RECEIPT_DETAIL}?receiptId={receipt_id}" + api_response = await page.request.get(url) + if not api_response.ok: + logger.warning( + "Receipt detail request failed for %s: %d", + receipt_id, + api_response.status, + ) + return {} + detail = await api_response.json() + return detail if isinstance(detail, dict) else {} + except Exception: + logger.exception("Failed to fetch receipt detail for %s", receipt_id) + return {} + + def parse_receipt(self, raw: RawReceipt) -> dict: + """Parse raw Meijer receipt into structured purchase data. + + Delegates to the dedicated parser module. + """ + from receiptwitness.parsers.meijer import parse_meijer_receipt + + return parse_meijer_receipt(raw) diff --git a/src/receiptwitness/scrapers/target.py b/src/receiptwitness/scrapers/target.py new file mode 100644 index 0000000..1f959a6 --- /dev/null +++ b/src/receiptwitness/scrapers/target.py @@ -0,0 +1,326 @@ +"""Target Circle scraper using Playwright. + +Target stores ~1 year of in-store purchase history tied to Circle accounts. +Purchases appear when the user pays with a linked card, uses the Target app +wallet, or enters their Circle phone number at checkout. 
+ +Key endpoints (reverse-engineered from target.com SPA): +- Login: POST https://gsp.target.com/gsp/authentications/v1/auth_codes +- Order history: GET https://api.target.com/order_history/v1/orders (in-store tab) +- Receipt detail: GET https://api.target.com/order_history/v1/orders/{orderId} +""" + +import logging +from datetime import UTC, datetime, timedelta +from typing import cast + +from playwright.async_api import BrowserContext, Page, Playwright, async_playwright + +from receiptwitness.config import settings +from receiptwitness.scrapers.base import BaseScraper, RawReceipt, SessionData + +logger = logging.getLogger(__name__) + +# Target endpoints +TARGET_BASE = "https://www.target.com" +TARGET_LOGIN_PAGE = f"{TARGET_BASE}/login" +TARGET_ACCOUNT_PAGE = f"{TARGET_BASE}/account" +TARGET_ORDER_HISTORY = f"{TARGET_BASE}/account/orders" +TARGET_ORDER_API = "https://api.target.com/order_history/v1/orders" +TARGET_RECEIPT_API = "https://api.target.com/order_history/v1/orders" + +# Realistic browser fingerprint — Chrome on Windows +DEFAULT_USER_AGENT = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/131.0.0.0 Safari/537.36" +) +DEFAULT_VIEWPORT = {"width": 1920, "height": 1080} +DEFAULT_LOCALE = "en-US" +DEFAULT_TIMEZONE = "America/Detroit" # SE Michigan coverage + + +class TargetScraper(BaseScraper): + """Scraper for Target Circle in-store purchase history. + + Target's order history SPA loads purchase data from internal API + endpoints. This scraper authenticates via the web login flow, + captures session cookies, and uses those to hit the order history + API for in-store receipt data. 
+ """ + + async def _create_stealth_context( + self, playwright_instance: Playwright, cookies: list[dict] | None = None + ) -> BrowserContext: + """Create a browser context with stealth settings for Target.""" + browser = await playwright_instance.chromium.launch( + headless=settings.headless, + args=[ + "--disable-blink-features=AutomationControlled", + "--no-sandbox", + "--disable-dev-shm-usage", + ], + ) + context = await browser.new_context( + user_agent=DEFAULT_USER_AGENT, + viewport=DEFAULT_VIEWPORT, # type: ignore[arg-type] + locale=DEFAULT_LOCALE, + timezone_id=DEFAULT_TIMEZONE, + java_script_enabled=True, + bypass_csp=False, + color_scheme="light", + has_touch=False, + ) + # Mask webdriver and automation signals + await context.add_init_script( + """ + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + + window.chrome = { + runtime: {}, + loadTimes: function() {}, + csi: function() {}, + app: { isInstalled: false } + }; + + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5] + }); + + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'] + }); + + Object.defineProperty(navigator, 'platform', { + get: () => 'Win32' + }); + + Object.defineProperty(navigator, 'hardwareConcurrency', { + get: () => 8 + }); + + Object.defineProperty(navigator, 'deviceMemory', { + get: () => 8 + }); + """ + ) + if cookies: + await context.add_cookies(cookies) # type: ignore[arg-type] + return cast(BrowserContext, context) + + async def login(self, username: str, password: str) -> SessionData: + """Log in to Target and capture session cookies.""" + async with async_playwright() as p: + context = await self._create_stealth_context(p) + page = await context.new_page() + try: + return await self._perform_login(page, context, username, password) + finally: + if context.browser: + await context.browser.close() + + async def _perform_login( + self, page: Page, context: BrowserContext, username: str, password: str 
+ ) -> SessionData: + """Execute the Target login flow.""" + logger.info("Navigating to Target sign-in page") + await page.goto(TARGET_LOGIN_PAGE, wait_until="networkidle") + await self.human_delay(2000, 4000) + + # Target login form — email/username field + email_input = page.locator( + 'input[id="username"], ' + 'input[name="username"], ' + 'input[type="email"], ' + 'input[data-test="username"]' + ) + await email_input.wait_for(state="visible", timeout=settings.browser_timeout_ms) + await email_input.click() + await self.human_delay(300, 700) + await email_input.fill(username) + await self.human_delay(800, 1500) + + # Password field + password_input = page.locator( + 'input[id="password"], ' + 'input[name="password"], ' + 'input[type="password"], ' + 'input[data-test="password"]' + ) + await password_input.wait_for(state="visible", timeout=settings.browser_timeout_ms) + await password_input.click() + await self.human_delay(300, 700) + await password_input.fill(password) + await self.human_delay(1000, 2000) + + # Sign-in button + sign_in_btn = page.locator( + 'button[id="login"], ' + 'button[data-test="login-button"], ' + 'button[type="submit"]:has-text("Sign in")' + ) + await sign_in_btn.click() + + # Wait for redirect away from login page + await page.wait_for_url( + lambda url: "login" not in url.lower(), + timeout=settings.browser_timeout_ms, + ) + await self.human_delay(1500, 3000) + + # Capture cookies + raw_cookies = await context.cookies() + cookies = [dict(c) for c in raw_cookies] + now = datetime.now(UTC) + + logger.info("Target login successful, captured %d cookies", len(cookies)) + return SessionData( + cookies=cookies, + user_agent=DEFAULT_USER_AGENT, + created_at=now, + expires_at=now + timedelta(hours=2), + extra={"retailer": "target"}, + ) + + async def check_session(self, session: SessionData) -> bool: + """Check if the Target session is still valid.""" + if session.expires_at and datetime.now(UTC) > session.expires_at: + logger.info("Target 
session expired based on timestamp") + return False + + async with async_playwright() as p: + context = await self._create_stealth_context(p, cookies=session.cookies) + page = await context.new_page() + try: + response = await page.goto(TARGET_ACCOUNT_PAGE, wait_until="networkidle") + current_url = page.url.lower() + is_valid = "login" not in current_url and response is not None and response.ok + logger.info("Target session check: valid=%s (url=%s)", is_valid, page.url) + return is_valid + except Exception: + logger.exception("Target session check failed") + return False + finally: + if context.browser: + await context.browser.close() + + async def scrape_receipts( + self, session: SessionData, since: datetime | None = None + ) -> list[RawReceipt]: + """Scrape in-store purchase history from Target Circle.""" + async with async_playwright() as p: + context = await self._create_stealth_context(p, cookies=session.cookies) + page = await context.new_page() + try: + return await self._fetch_receipts(page, since) + finally: + if context.browser: + await context.browser.close() + + async def _fetch_receipts(self, page: Page, since: datetime | None) -> list[RawReceipt]: + """Fetch receipt list and details from Target order history. + + Target's order history page has separate tabs for online and in-store + purchases. We target the in-store tab which shows Circle-linked + transactions. 
+ """ + # Navigate to order history to establish context + await page.goto(TARGET_ORDER_HISTORY, wait_until="networkidle") + await self.human_delay(1500, 3000) + + receipts: list[RawReceipt] = [] + + # Target order history API — filter for in-store purchases + api_response = await page.request.get( + TARGET_ORDER_API, + params={"channel": "in_store", "limit": "50"}, + ) + if not api_response.ok: + logger.warning( + "Target order history request failed: %d %s", + api_response.status, + api_response.status_text, + ) + return [] + + response = await api_response.json() + if not isinstance(response, dict): + logger.warning("Unexpected order history response type: %s", type(response)) + return [] + + # Target uses "orders" key for in-store purchase list + orders = response.get("orders", response.get("transactions", [])) + if not isinstance(orders, list): + logger.warning("No orders found in Target order history response") + return [] + + logger.info("Found %d in-store orders in Target history", len(orders)) + + for order in orders: + raw_id = order.get("orderId") or order.get("transactionId") or order.get("id") or "" + order_id = str(raw_id) + purchase_date = order.get( + "purchaseDate", + order.get("transactionDate", order.get("date", "")), + ) + + # Filter by date if 'since' is provided + if since and purchase_date: + try: + txn_dt = datetime.fromisoformat(purchase_date.replace("Z", "+00:00")) + if txn_dt < since: + continue + except (ValueError, TypeError): + pass + + if not order_id: + continue + + await self.human_delay(1000, 2500) + + # Fetch receipt detail + detail = await self._fetch_receipt_detail(page, order_id) + + raw_store = ( + order.get("storeNumber") or order.get("storeId") or order.get("locationId") or "" + ) + store_number = str(raw_store) + + receipts.append( + RawReceipt( + receipt_id=order_id, + purchase_date=purchase_date, + store_number=store_number, + raw_data={**order, "detail": detail}, + source_url=f"{TARGET_RECEIPT_API}/{order_id}", + ) + ) + 
async def _fetch_receipt_detail(self, page: Page, order_id: str) -> dict:
    """Retrieve the detail payload for one Target order.

    This is a best-effort lookup: it never raises. An empty dict is
    returned when the HTTP response is not OK, when the decoded JSON is
    not a dict, or when any exception occurs (the exception is logged).

    Args:
        page: Page whose ``request`` context is used for the GET.
        order_id: Identifier appended to ``TARGET_RECEIPT_API``.

    Returns:
        The decoded JSON object, or ``{}`` on any failure.
    """
    try:
        response = await page.request.get(f"{TARGET_RECEIPT_API}/{order_id}")
        if response.ok:
            payload = await response.json()
            if isinstance(payload, dict):
                return payload
            # Valid JSON of an unexpected shape is treated as "no detail".
            return {}
        logger.warning(
            "Target receipt detail request failed for %s: %d",
            order_id,
            response.status,
        )
    except Exception:
        # Deliberate catch-all: one bad receipt must not abort the scrape.
        logger.exception("Failed to fetch Target receipt detail for %s", order_id)
    return {}


def parse_receipt(self, raw: RawReceipt) -> dict:
    """Turn a raw Target receipt into structured purchase data.

    Delegates to ``parse_target_receipt``; the parser is imported at call
    time rather than at module import time.
    """
    from receiptwitness.parsers.target import parse_target_receipt as _parse

    parsed = _parse(raw)
    return parsed
def _get_fernet() -> Fernet:
    """Build a Fernet cipher from ``settings.session_encryption_key``.

    Raises:
        ValueError: If no encryption key is configured.
    """
    key = settings.session_encryption_key
    if key:
        # Fernet wants bytes; accept the key as either str or bytes.
        key_bytes = key.encode() if isinstance(key, str) else key
        return Fernet(key_bytes)
    raise ValueError(
        "RW_SESSION_ENCRYPTION_KEY is not set. "
        "Generate one with: "
        "python -c 'from cryptography.fernet import Fernet; "
        "print(Fernet.generate_key().decode())'"
    )


def encrypt_session_data(data: dict) -> str:
    """Serialize *data* to JSON and encrypt it into a Fernet token string.

    Values that JSON cannot encode natively are stringified via
    ``default=str``. The token is returned as text so it can be stored in
    a text/JSON column.
    """
    cipher = _get_fernet()
    serialized = json.dumps(data, default=str)
    token = cipher.encrypt(serialized.encode("utf-8"))
    return token.decode("utf-8")


def decrypt_session_data(encrypted: str) -> dict:
    """Decrypt a Fernet token string and decode its JSON payload.

    Raises:
        InvalidToken: If the token is malformed or was produced with a
            different key. The failure is logged, then re-raised.
    """
    cipher = _get_fernet()
    token = encrypted.encode("utf-8")
    try:
        plaintext = cipher.decrypt(token)
    except InvalidToken:
        logger.error("Failed to decrypt session data — invalid token or wrong key")
        raise
    result: dict = json.loads(plaintext)
    return result
+ +Manages the lifecycle of retailer session data: +- Load encrypted session from DB +- Check validity via scraper +- Re-authenticate if expired +- Save new session back (encrypted) +""" + +import logging +from dataclasses import asdict +from datetime import UTC, datetime + +from receiptwitness.scrapers.base import BaseScraper, SessionData +from receiptwitness.session.encryption import decrypt_session_data, encrypt_session_data + +logger = logging.getLogger(__name__) + + +def session_from_db_record(session_data_encrypted: str | None) -> SessionData | None: + """Deserialize and decrypt a session from the database. + + The session_data column in user_store_accounts stores the Fernet-encrypted + JSON of the SessionData fields. + """ + if not session_data_encrypted: + return None + + try: + data = decrypt_session_data(session_data_encrypted) + return SessionData( + cookies=data["cookies"], + user_agent=data["user_agent"], + created_at=datetime.fromisoformat(data["created_at"]), + expires_at=( + datetime.fromisoformat(data["expires_at"]) if data.get("expires_at") else None + ), + extra=data.get("extra", {}), + ) + except Exception: + logger.exception("Failed to load session from DB record") + return None + + +def session_to_db_value(session: SessionData) -> str: + """Serialize and encrypt a session for database storage.""" + data = asdict(session) + # Convert datetime objects to ISO strings for JSON serialization + data["created_at"] = session.created_at.isoformat() + if session.expires_at: + data["expires_at"] = session.expires_at.isoformat() + return encrypt_session_data(data) + + +async def get_valid_session( + scraper: BaseScraper, + session_data_encrypted: str | None, + username: str, + password: str, +) -> tuple[SessionData, bool]: + """Get a valid session, re-authenticating if needed. + + Returns: + A tuple of (session, was_refreshed). If was_refreshed is True, + the caller should persist the new session to the database. 
+ """ + # Try existing session first + existing = session_from_db_record(session_data_encrypted) + if existing: + if existing.expires_at and datetime.now(UTC) > existing.expires_at: + logger.info("Session expired by timestamp, re-authenticating") + elif await scraper.check_session(existing): + logger.info("Existing session is valid") + return existing, False + else: + logger.info("Session check failed, re-authenticating") + + # Need to re-authenticate + logger.info("Performing fresh login") + new_session = await scraper.login(username, password) + return new_session, True diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..a8b29ba --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,29 @@ +"""Shared test fixtures.""" + +import json +from pathlib import Path + +import pytest + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + +@pytest.fixture +def meijer_receipt_data() -> dict: + """Load the sample Meijer receipt fixture.""" + with open(FIXTURES_DIR / "meijer_receipt.json") as f: + return json.load(f) + + +@pytest.fixture +def kroger_receipt_data() -> dict: + """Load the sample Kroger receipt fixture.""" + with open(FIXTURES_DIR / "kroger_receipt.json") as f: + return json.load(f) + + +@pytest.fixture +def target_receipt_data() -> dict: + """Load the sample Target receipt fixture.""" + with open(FIXTURES_DIR / "target_receipt.json") as f: + return json.load(f) diff --git a/tests/fixtures/kroger_receipt.json b/tests/fixtures/kroger_receipt.json new file mode 100644 index 0000000..51c0481 --- /dev/null +++ b/tests/fixtures/kroger_receipt.json @@ -0,0 +1,131 @@ +{ + "orderId": "KR-2026-0312-4471", + "purchaseDate": "2026-03-12T16:45:00Z", + "storeNumber": "00357", + "divisionNumber": "014", + "total": 94.17, + "savings": 15.30, + "detail": { + "receiptId": "KR-2026-0312-4471", + "items": [ + { + "description": "KROGER WHOLE MILK GAL", + "upc": "0001111041700", + "quantity": 1, + "basePrice": 3.99, + "totalPrice": 3.99, + 
"regularPrice": 4.29, + "salePrice": 3.99, + "couponAmount": 0.0, + "plusCardSavings": 0.30, + "department": "DAIRY" + }, + { + "description": "BANANAS", + "upc": "0000000004011", + "quantity": 1, + "basePrice": 0.59, + "totalPrice": 0.59, + "regularPrice": 0.59, + "salePrice": null, + "couponAmount": null, + "plusCardSavings": null, + "department": "PRODUCE" + }, + { + "description": "SIMPLE TRUTH ORG EGGS 12CT", + "upc": "0001111087840", + "quantity": 2, + "basePrice": 5.49, + "totalPrice": 10.98, + "regularPrice": 5.99, + "salePrice": 5.49, + "couponAmount": 0.0, + "plusCardSavings": 1.00, + "department": "DAIRY" + }, + { + "description": "KROGER DELI TURKEY BREAST", + "upc": null, + "quantity": 0.68, + "basePrice": 9.99, + "totalPrice": 6.79, + "regularPrice": 9.99, + "salePrice": null, + "weight": 0.68, + "weightUom": "LB", + "department": "DELI" + }, + { + "description": "TIDE PODS 42CT", + "upc": "0003700096223", + "quantity": 1, + "basePrice": 13.99, + "totalPrice": 13.99, + "regularPrice": 15.99, + "salePrice": 13.99, + "couponAmount": 2.00, + "plusCardSavings": 0.0, + "department": "HOUSEHOLD" + }, + { + "description": "VOIDED DORITOS NACHO", + "upc": "0002840032505", + "quantity": 1, + "basePrice": 4.79, + "totalPrice": 4.79, + "voided": true, + "department": "SNACKS" + }, + { + "description": "RETURNED GATORADE 8PK", + "upc": "0005200012505", + "quantity": 1, + "basePrice": 7.99, + "totalPrice": 7.99, + "status": "RETURNED", + "department": "BEVERAGES" + }, + { + "description": "KROGER SHARP CHEDDAR 8OZ", + "upc": "0001111060930", + "quantity": 1, + "basePrice": 3.49, + "totalPrice": 3.49, + "regularPrice": 3.49, + "salePrice": null, + "couponAmount": null, + "plusCardSavings": null, + "department": "DAIRY" + }, + { + "description": "PRIVATE SELECTION PASTA", + "upc": "0001111085612", + "quantity": 3, + "basePrice": 2.49, + "totalPrice": 7.47, + "regularPrice": 2.99, + "salePrice": 2.49, + "couponAmount": 0.0, + "plusCardSavings": 1.50, + "department": 
"GROCERY" + }, + { + "description": "KROGER GROUND BEEF 80/20", + "upc": null, + "quantity": 1.23, + "basePrice": 5.99, + "totalPrice": 7.37, + "regularPrice": 6.99, + "salePrice": 5.99, + "weight": 1.23, + "weightUom": "LB", + "department": "MEAT" + } + ], + "subtotal": 78.47, + "tax": 5.50, + "total": 94.17, + "totalSavings": 15.30 + } +} diff --git a/tests/fixtures/meijer_receipt.json b/tests/fixtures/meijer_receipt.json new file mode 100644 index 0000000..a733215 --- /dev/null +++ b/tests/fixtures/meijer_receipt.json @@ -0,0 +1,85 @@ +{ + "transactionId": "TXN-2026-0310-001", + "transactionDate": "2026-03-10T14:30:00Z", + "storeNumber": "42", + "total": 87.42, + "savings": 12.50, + "detail": { + "receiptId": "TXN-2026-0310-001", + "items": [ + { + "description": "ORGANIC BANANAS", + "upc": "0000000004011", + "quantity": 1, + "price": 0.69, + "extendedPrice": 0.69, + "regularPrice": 0.79, + "salePrice": 0.69, + "couponDiscount": 0.0, + "mperksDiscount": 0.10, + "category": "PRODUCE" + }, + { + "description": "MEIJER 2% MILK GAL", + "upc": "0041250000123", + "quantity": 2, + "price": 3.49, + "extendedPrice": 6.98, + "regularPrice": 3.79, + "salePrice": 3.49, + "couponDiscount": 0.0, + "mperksDiscount": 0.0, + "category": "DAIRY" + }, + { + "description": "CHEERIOS 18OZ", + "upc": "0016000275614", + "quantity": 1, + "price": 4.99, + "extendedPrice": 4.99, + "regularPrice": 5.49, + "salePrice": null, + "couponDiscount": 0.50, + "mperksDiscount": 0.0, + "category": "CEREAL" + }, + { + "description": "WEIGHTED DELI TURKEY", + "upc": null, + "quantity": 0.75, + "price": 8.99, + "extendedPrice": 6.74, + "regularPrice": 8.99, + "salePrice": null, + "couponDiscount": null, + "mperksDiscount": null, + "category": "DELI" + }, + { + "description": "VOIDED SODA 12PK", + "upc": "0004900005678", + "quantity": 1, + "price": 5.99, + "extendedPrice": 5.99, + "voided": true, + "category": "BEVERAGES" + }, + { + "description": "MEIJER PAPER TOWELS 6PK", + "upc": "0041250099001", + 
"quantity": 1, + "price": 7.99, + "extendedPrice": 7.99, + "regularPrice": 9.99, + "salePrice": 7.99, + "couponDiscount": 1.00, + "mperksDiscount": 1.00, + "category": "HOUSEHOLD" + } + ], + "subtotal": 74.92, + "tax": 5.24, + "total": 87.42, + "totalSavings": 12.50 + } +} diff --git a/tests/fixtures/target_receipt.json b/tests/fixtures/target_receipt.json new file mode 100644 index 0000000..c76bb5b --- /dev/null +++ b/tests/fixtures/target_receipt.json @@ -0,0 +1,140 @@ +{ + "orderId": "TGT-2026-0315-7890", + "purchaseDate": "2026-03-15T11:23:00Z", + "storeNumber": "2774", + "total": 83.21, + "savings": 11.45, + "detail": { + "receiptId": "TGT-2026-0315-7890", + "items": [ + { + "description": "GOOD & GATHER WHOLE MILK GAL", + "tcin": "14767459", + "upc": "0085239100123", + "quantity": 1, + "unitPrice": 3.89, + "totalPrice": 3.89, + "regularPrice": 4.19, + "circlePrice": 3.89, + "couponDiscount": 0.0, + "circleRewardsDiscount": 0.30, + "promoDescription": "Circle offer: Save 30c", + "department": "GROCERY" + }, + { + "description": "BANANAS", + "upc": "0000000004011", + "quantity": 1, + "unitPrice": 0.25, + "totalPrice": 0.25, + "regularPrice": 0.25, + "circlePrice": null, + "couponDiscount": null, + "circleRewardsDiscount": null, + "department": "PRODUCE" + }, + { + "description": "MARKET PANTRY LARGE EGGS 18CT", + "tcin": "13292174", + "upc": "0085239206753", + "quantity": 2, + "unitPrice": 4.99, + "totalPrice": 9.98, + "regularPrice": 5.49, + "circlePrice": 4.99, + "couponDiscount": 0.0, + "circleRewardsDiscount": 1.00, + "promoDescription": "Circle offer: 2 for $10", + "department": "GROCERY" + }, + { + "description": "DELI SLICED TURKEY BREAST", + "upc": null, + "quantity": 0.72, + "unitPrice": 10.99, + "totalPrice": 7.91, + "regularPrice": 10.99, + "weight": 0.72, + "weightUom": "LB", + "department": "DELI" + }, + { + "description": "TIDE PODS 42CT", + "tcin": "76150253", + "upc": "0003700096223", + "quantity": 1, + "unitPrice": 13.49, + "totalPrice": 13.49, 
+ "regularPrice": 15.99, + "circlePrice": 13.49, + "couponDiscount": 2.50, + "circleRewardsDiscount": 0.0, + "promoDescription": "Circle offer + mfr coupon", + "department": "HOUSEHOLD" + }, + { + "description": "UP&UP PAPER TOWELS 6PK", + "tcin": "52493117", + "upc": "0085239401567", + "quantity": 1, + "unitPrice": 8.99, + "totalPrice": 8.99, + "regularPrice": 8.99, + "circlePrice": null, + "couponDiscount": null, + "circleRewardsDiscount": null, + "department": "HOUSEHOLD" + }, + { + "description": "VOIDED COCA-COLA 12PK", + "upc": "0004900002521", + "quantity": 1, + "unitPrice": 7.49, + "totalPrice": 7.49, + "voided": true, + "department": "BEVERAGES" + }, + { + "description": "RETURNED OLAY MOISTURIZER", + "upc": "0007560402118", + "quantity": 1, + "unitPrice": 12.99, + "totalPrice": 12.99, + "status": "RETURNED", + "department": "BEAUTY" + }, + { + "description": "FAVOURITE DAY TRAIL MIX", + "tcin": "83921045", + "dpci": "271-09-0142", + "upc": "0085239700891", + "quantity": 1, + "unitPrice": 5.49, + "totalPrice": 5.49, + "regularPrice": 5.49, + "circlePrice": null, + "couponDiscount": null, + "circleRewardsDiscount": null, + "department": "SNACKS" + }, + { + "description": "BOGO GOOD & GATHER PASTA", + "tcin": "78114326", + "upc": "0085239300456", + "quantity": 2, + "unitPrice": 1.79, + "totalPrice": 1.79, + "regularPrice": 1.79, + "circlePrice": 0.895, + "couponDiscount": 0.0, + "circleRewardsDiscount": 1.79, + "promoDescription": "Buy 1 get 1 free", + "department": "GROCERY" + } + ], + "subtotal": 78.32, + "tax": 4.89, + "total": 83.21, + "totalSavings": 11.45 + } +} diff --git a/tests/test_parsers/__init__.py b/tests/test_parsers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_parsers/test_kroger_parser.py b/tests/test_parsers/test_kroger_parser.py new file mode 100644 index 0000000..001d205 --- /dev/null +++ b/tests/test_parsers/test_kroger_parser.py @@ -0,0 +1,399 @@ +"""Tests for the Kroger receipt parser.""" + +from 
class TestToDecimal:
    """Checks for the Kroger parser's ``_to_decimal`` coercion helper."""

    def test_from_int(self):
        converted = _to_decimal(42)
        assert converted == Decimal("42")

    def test_from_float(self):
        converted = _to_decimal(3.99)
        assert converted == Decimal("3.99")

    def test_from_string(self):
        converted = _to_decimal("7.49")
        assert converted == Decimal("7.49")

    def test_none_returns_default(self):
        # Without an explicit default, None falls back to zero.
        converted = _to_decimal(None)
        assert converted == Decimal("0")

    def test_none_custom_default(self):
        converted = _to_decimal(None, "1")
        assert converted == Decimal("1")

    def test_invalid_string_returns_default(self):
        converted = _to_decimal("not-a-number")
        assert converted == Decimal("0")

    def test_empty_string_returns_default(self):
        converted = _to_decimal("")
        assert converted == Decimal("0")
result["unit_price"] == Decimal("9.99") + assert result["extended_price"] == Decimal("6.79") + + def test_missing_extended_price_computed(self): + raw = { + "description": "TEST ITEM", + "quantity": 3, + "basePrice": 2.49, + } + result = _parse_item(raw) + assert result["extended_price"] == Decimal("2.49") * Decimal("3") + + def test_item_with_coupon(self): + raw = { + "description": "TIDE PODS 42CT", + "upc": "0003700096223", + "quantity": 1, + "basePrice": 13.99, + "totalPrice": 13.99, + "couponAmount": 2.00, + } + result = _parse_item(raw) + assert result["coupon_discount"] == Decimal("2.00") + + def test_missing_description_fallback(self): + raw = {"basePrice": 1.00, "totalPrice": 1.00} + result = _parse_item(raw) + assert result["product_name_raw"] == "UNKNOWN ITEM" + + def test_alternative_field_names_product_name(self): + raw = { + "productName": "ALT NAME ITEM", + "unitPrice": 5.00, + "extendedAmount": 5.00, + "qty": 1, + "krogerProductId": "123456789", + "category": "GROCERY", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "ALT NAME ITEM" + assert result["unit_price"] == Decimal("5.00") + assert result["extended_price"] == Decimal("5.00") + assert result["upc"] == "123456789" + assert result["category_raw"] == "GROCERY" + + def test_item_description_field_name(self): + raw = { + "itemDescription": "ITEM DESC FIELD", + "price": 3.00, + "lineTotal": 3.00, + } + result = _parse_item(raw) + assert result["product_name_raw"] == "ITEM DESC FIELD" + assert result["unit_price"] == Decimal("3.00") + assert result["extended_price"] == Decimal("3.00") + + def test_null_optional_fields(self): + raw = { + "description": "BANANAS", + "upc": "0000000004011", + "quantity": 1, + "basePrice": 0.59, + "totalPrice": 0.59, + "salePrice": None, + "couponAmount": None, + "plusCardSavings": None, + } + result = _parse_item(raw) + assert result["sale_price"] is None + assert result["coupon_discount"] is None + assert result["loyalty_discount"] is None + + 
def test_upc_leading_zeros_stripped(self): + raw = { + "description": "TEST", + "upc": "0000000004011", + "basePrice": 1.00, + "totalPrice": 1.00, + } + result = _parse_item(raw) + assert result["upc"] == "4011" + + def test_upc_from_kroger_product_id(self): + raw = { + "description": "TEST", + "krogerProductId": "987654321", + "basePrice": 1.00, + "totalPrice": 1.00, + } + result = _parse_item(raw) + assert result["upc"] == "987654321" + + def test_description_whitespace_stripped(self): + raw = { + "description": " EXTRA SPACES ", + "basePrice": 1.00, + "totalPrice": 1.00, + } + result = _parse_item(raw) + assert result["product_name_raw"] == "EXTRA SPACES" + + def test_promo_price_field(self): + raw = { + "description": "PROMO ITEM", + "promoPrice": 2.99, + "originalPrice": 4.99, + "basePrice": 2.99, + "totalPrice": 2.99, + } + result = _parse_item(raw) + assert result["sale_price"] == Decimal("2.99") + assert result["regular_price"] == Decimal("4.99") + + def test_loyalty_discount_from_fuel_points(self): + raw = { + "description": "FUEL DISC ITEM", + "fuelPointsDiscount": 0.50, + "basePrice": 3.00, + "totalPrice": 3.00, + } + result = _parse_item(raw) + assert result["loyalty_discount"] == Decimal("0.50") + + def test_multi_quantity_item(self): + raw = { + "description": "PRIVATE SELECTION PASTA", + "quantity": 3, + "basePrice": 2.49, + "totalPrice": 7.47, + "department": "GROCERY", + } + result = _parse_item(raw) + assert result["quantity"] == Decimal("3") + assert result["unit_price"] == Decimal("2.49") + assert result["extended_price"] == Decimal("7.47") + + def test_aisle_as_category(self): + raw = { + "description": "AISLE ITEM", + "aisle": "FROZEN FOODS", + "basePrice": 4.00, + "totalPrice": 4.00, + } + result = _parse_item(raw) + assert result["category_raw"] == "FROZEN FOODS" + + +class TestParseKrogerReceipt: + def test_full_receipt(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12T16:45:00Z", 
+ store_number="00357", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + + assert result["receipt_id"] == "KR-2026-0312-4471" + assert result["purchase_date"] == "2026-03-12T16:45:00Z" + assert result["total"] == Decimal("94.17") + assert result["subtotal"] == Decimal("78.47") + assert result["tax"] == Decimal("5.50") + assert result["savings_total"] == Decimal("15.30") + + # Should have 8 items (voided + returned items excluded) + assert len(result["items"]) == 8 + + # Verify first item + milk = result["items"][0] + assert milk["product_name_raw"] == "KROGER WHOLE MILK GAL" + assert milk["upc"] == "1111041700" + + def test_voided_items_excluded(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + + item_names = [i["product_name_raw"] for i in result["items"]] + assert "VOIDED DORITOS NACHO" not in item_names + + def test_returned_items_excluded(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + + item_names = [i["product_name_raw"] for i in result["items"]] + assert "RETURNED GATORADE 8PK" not in item_names + + def test_return_flag_items_excluded(self): + data = { + "detail": { + "items": [ + { + "description": "NORMAL ITEM", + "basePrice": 5.00, + "totalPrice": 5.00, + }, + { + "description": "RETURNED VIA FLAG", + "basePrice": 3.00, + "totalPrice": 3.00, + "returnFlag": True, + }, + { + "description": "IS RETURN ITEM", + "basePrice": 2.00, + "totalPrice": 2.00, + "isReturn": True, + }, + ], + "total": 5.00, + } + } + raw = RawReceipt( + receipt_id="RET-001", + purchase_date="2026-03-12", + raw_data=data, + ) + result = parse_kroger_receipt(raw) + assert len(result["items"]) == 1 + assert result["items"][0]["product_name_raw"] == "NORMAL ITEM" + + def 
test_empty_receipt(self): + raw = RawReceipt( + receipt_id="EMPTY-001", + purchase_date="2026-03-12", + raw_data={"detail": {"items": [], "total": 0}}, + ) + result = parse_kroger_receipt(raw) + assert result["items"] == [] + assert result["total"] == Decimal("0") + + def test_receipt_with_no_detail(self): + raw = RawReceipt( + receipt_id="NO-DETAIL-001", + purchase_date="2026-03-12", + raw_data={"total": 50.00}, + ) + result = parse_kroger_receipt(raw) + assert result["items"] == [] + assert result["total"] == Decimal("50.00") + + def test_raw_data_preserved(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + assert result["raw_data"] is kroger_receipt_data + + def test_alternative_total_field_names(self): + raw = RawReceipt( + receipt_id="ALT-001", + purchase_date="2026-03-12", + raw_data={ + "orderTotal": 42.00, + "subTotal": 35.00, + "salesTax": 3.50, + "youSaved": 5.00, + "detail": {"items": []}, + }, + ) + result = parse_kroger_receipt(raw) + assert result["total"] == Decimal("42.00") + assert result["subtotal"] == Decimal("35.00") + assert result["tax"] == Decimal("3.50") + assert result["savings_total"] == Decimal("5.00") + + def test_receipt_items_alternative_key(self): + data = { + "detail": { + "receiptItems": [ + { + "description": "ALT KEY ITEM", + "basePrice": 3.00, + "totalPrice": 3.00, + } + ], + "total": 3.00, + } + } + raw = RawReceipt( + receipt_id="ALT-KEY-001", + purchase_date="2026-03-12", + raw_data=data, + ) + result = parse_kroger_receipt(raw) + assert len(result["items"]) == 1 + assert result["items"][0]["product_name_raw"] == "ALT KEY ITEM" + + def test_source_url_preserved(self): + raw = RawReceipt( + receipt_id="URL-001", + purchase_date="2026-03-12", + raw_data={"detail": {"items": [], "total": 0}}, + source_url="https://www.kroger.com/atlas/v1/receipt/api?orderId=URL-001", + ) + result = 
class TestToDecimal:
    """Checks for the Meijer parser's ``_to_decimal`` coercion helper."""

    def test_from_int(self):
        converted = _to_decimal(42)
        assert converted == Decimal("42")

    def test_from_float(self):
        converted = _to_decimal(3.49)
        assert converted == Decimal("3.49")

    def test_from_string(self):
        converted = _to_decimal("7.99")
        assert converted == Decimal("7.99")

    def test_none_returns_default(self):
        # Without an explicit default, None falls back to zero.
        converted = _to_decimal(None)
        assert converted == Decimal("0")

    def test_none_custom_default(self):
        converted = _to_decimal(None, "1")
        assert converted == Decimal("1")

    def test_invalid_string_returns_default(self):
        converted = _to_decimal("not-a-number")
        assert converted == Decimal("0")
0.69, + "extendedPrice": 0.69, + "regularPrice": 0.79, + "salePrice": 0.69, + "couponDiscount": 0.0, + "mperksDiscount": 0.10, + "category": "PRODUCE", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "ORGANIC BANANAS" + assert result["upc"] == "4011" + assert result["quantity"] == Decimal("1") + assert result["unit_price"] == Decimal("0.69") + assert result["extended_price"] == Decimal("0.69") + assert result["regular_price"] == Decimal("0.79") + assert result["sale_price"] == Decimal("0.69") + assert result["loyalty_discount"] == Decimal("0.10") + assert result["category_raw"] == "PRODUCE" + + def test_weighted_item(self): + raw = { + "description": "WEIGHTED DELI TURKEY", + "quantity": 0.75, + "price": 8.99, + "extendedPrice": 6.74, + "category": "DELI", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "WEIGHTED DELI TURKEY" + assert result["upc"] is None + assert result["quantity"] == Decimal("0.75") + assert result["unit_price"] == Decimal("8.99") + assert result["extended_price"] == Decimal("6.74") + + def test_missing_extended_price_computed(self): + raw = { + "description": "TEST ITEM", + "quantity": 3, + "price": 2.50, + } + result = _parse_item(raw) + assert result["extended_price"] == Decimal("2.50") * Decimal("3") + + def test_item_with_coupon_discount(self): + raw = { + "description": "CHEERIOS 18OZ", + "upc": "0016000275614", + "quantity": 1, + "price": 4.99, + "extendedPrice": 4.99, + "couponDiscount": 0.50, + } + result = _parse_item(raw) + assert result["coupon_discount"] == Decimal("0.50") + + def test_missing_description_fallback(self): + raw = {"price": 1.00, "extendedPrice": 1.00} + result = _parse_item(raw) + assert result["product_name_raw"] == "UNKNOWN ITEM" + + def test_alternative_field_names(self): + raw = { + "itemDescription": "ALT NAME ITEM", + "unitPrice": 5.00, + "totalPrice": 5.00, + "qty": 1, + "UPC": "123456789", + "departmentDescription": "GROCERY", + } + result = _parse_item(raw) + 
assert result["product_name_raw"] == "ALT NAME ITEM" + assert result["unit_price"] == Decimal("5.00") + assert result["upc"] == "123456789" + assert result["category_raw"] == "GROCERY" + + +class TestParseMeijerReceipt: + def test_full_receipt(self, meijer_receipt_data): + raw = RawReceipt( + receipt_id="TXN-2026-0310-001", + purchase_date="2026-03-10T14:30:00Z", + store_number="42", + raw_data=meijer_receipt_data, + ) + result = parse_meijer_receipt(raw) + + assert result["receipt_id"] == "TXN-2026-0310-001" + assert result["purchase_date"] == "2026-03-10T14:30:00Z" + assert result["total"] == Decimal("87.42") + assert result["subtotal"] == Decimal("74.92") + assert result["tax"] == Decimal("5.24") + assert result["savings_total"] == Decimal("12.50") + + # Should have 5 items (voided item excluded) + assert len(result["items"]) == 5 + + # Verify first item + bananas = result["items"][0] + assert bananas["product_name_raw"] == "ORGANIC BANANAS" + assert bananas["upc"] == "4011" + + def test_voided_items_excluded(self, meijer_receipt_data): + raw = RawReceipt( + receipt_id="TXN-2026-0310-001", + purchase_date="2026-03-10", + raw_data=meijer_receipt_data, + ) + result = parse_meijer_receipt(raw) + + item_names = [i["product_name_raw"] for i in result["items"]] + assert "VOIDED SODA 12PK" not in item_names + + def test_empty_receipt(self): + raw = RawReceipt( + receipt_id="EMPTY-001", + purchase_date="2026-03-10", + raw_data={"detail": {"items": [], "total": 0}}, + ) + result = parse_meijer_receipt(raw) + assert result["items"] == [] + assert result["total"] == Decimal("0") + + def test_receipt_with_no_detail(self): + raw = RawReceipt( + receipt_id="NO-DETAIL-001", + purchase_date="2026-03-10", + raw_data={"total": 50.00}, + ) + result = parse_meijer_receipt(raw) + assert result["items"] == [] + assert result["total"] == Decimal("50.00") + + def test_raw_data_preserved(self, meijer_receipt_data): + raw = RawReceipt( + receipt_id="TXN-2026-0310-001", + 
class TestToDecimal:
    """Checks for the Target parser's ``_to_decimal`` coercion helper."""

    def test_from_int(self):
        converted = _to_decimal(42)
        assert converted == Decimal("42")

    def test_from_float(self):
        converted = _to_decimal(3.89)
        assert converted == Decimal("3.89")

    def test_from_string(self):
        converted = _to_decimal("8.99")
        assert converted == Decimal("8.99")

    def test_none_returns_default(self):
        # Without an explicit default, None falls back to zero.
        converted = _to_decimal(None)
        assert converted == Decimal("0")

    def test_none_custom_default(self):
        converted = _to_decimal(None, "1")
        assert converted == Decimal("1")

    def test_invalid_string_returns_default(self):
        converted = _to_decimal("not-a-number")
        assert converted == Decimal("0")

    def test_empty_string_returns_default(self):
        converted = _to_decimal("")
        assert converted == Decimal("0")
Decimal("0.30") + assert result["category_raw"] == "GROCERY" + + def test_weighted_item(self): + raw = { + "description": "DELI SLICED TURKEY BREAST", + "quantity": 0.72, + "unitPrice": 10.99, + "totalPrice": 7.91, + "weight": 0.72, + "weightUom": "LB", + "department": "DELI", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "DELI SLICED TURKEY BREAST" + assert result["upc"] is None + assert result["quantity"] == Decimal("0.72") + assert result["unit_price"] == Decimal("10.99") + assert result["extended_price"] == Decimal("7.91") + + def test_missing_extended_price_computed(self): + raw = { + "description": "TEST ITEM", + "quantity": 3, + "unitPrice": 2.49, + } + result = _parse_item(raw) + assert result["extended_price"] == Decimal("2.49") * Decimal("3") + + def test_item_with_coupon(self): + raw = { + "description": "TIDE PODS 42CT", + "upc": "0003700096223", + "quantity": 1, + "unitPrice": 13.49, + "totalPrice": 13.49, + "couponDiscount": 2.50, + } + result = _parse_item(raw) + assert result["coupon_discount"] == Decimal("2.50") + + def test_missing_description_fallback(self): + raw = {"unitPrice": 1.00, "totalPrice": 1.00} + result = _parse_item(raw) + assert result["product_name_raw"] == "UNKNOWN ITEM" + + def test_alternative_field_names(self): + raw = { + "productName": "ALT NAME ITEM", + "price": 5.00, + "extendedPrice": 5.00, + "qty": 1, + "UPC": "123456789", + "category": "FROZEN", + } + result = _parse_item(raw) + assert result["product_name_raw"] == "ALT NAME ITEM" + assert result["unit_price"] == Decimal("5.00") + assert result["extended_price"] == Decimal("5.00") + assert result["upc"] == "123456789" + assert result["category_raw"] == "FROZEN" + + def test_item_description_field_name(self): + raw = { + "itemDescription": "ITEM DESC FIELD", + "price": 3.00, + "lineTotal": 3.00, + } + result = _parse_item(raw) + assert result["product_name_raw"] == "ITEM DESC FIELD" + assert result["unit_price"] == Decimal("3.00") + assert 
result["extended_price"] == Decimal("3.00") + + def test_null_optional_fields(self): + raw = { + "description": "BANANAS", + "upc": "0000000004011", + "quantity": 1, + "unitPrice": 0.25, + "totalPrice": 0.25, + "circlePrice": None, + "couponDiscount": None, + "circleRewardsDiscount": None, + } + result = _parse_item(raw) + assert result["sale_price"] is None + assert result["coupon_discount"] is None + assert result["loyalty_discount"] is None + + def test_upc_leading_zeros_stripped(self): + raw = { + "description": "TEST", + "upc": "0000000004011", + "unitPrice": 1.00, + "totalPrice": 1.00, + } + result = _parse_item(raw) + assert result["upc"] == "4011" + + def test_description_whitespace_stripped(self): + raw = { + "description": " EXTRA SPACES ", + "unitPrice": 1.00, + "totalPrice": 1.00, + } + result = _parse_item(raw) + assert result["product_name_raw"] == "EXTRA SPACES" + + def test_circle_price_preferred_over_sale_price(self): + raw = { + "description": "CIRCLE ITEM", + "circlePrice": 2.99, + "salePrice": 3.49, + "unitPrice": 2.99, + "totalPrice": 2.99, + } + result = _parse_item(raw) + assert result["sale_price"] == Decimal("2.99") + + def test_sale_price_fallback_when_no_circle_price(self): + raw = { + "description": "SALE ITEM", + "salePrice": 3.49, + "unitPrice": 3.49, + "totalPrice": 3.49, + } + result = _parse_item(raw) + assert result["sale_price"] == Decimal("3.49") + + def test_circle_rewards_discount(self): + raw = { + "description": "CIRCLE REWARDS ITEM", + "circleRewardsDiscount": 1.50, + "unitPrice": 5.00, + "totalPrice": 5.00, + } + result = _parse_item(raw) + assert result["loyalty_discount"] == Decimal("1.50") + + def test_circle_discount_fallback(self): + raw = { + "description": "CIRCLE DISC ITEM", + "circleDiscount": 0.75, + "unitPrice": 3.00, + "totalPrice": 3.00, + } + result = _parse_item(raw) + assert result["loyalty_discount"] == Decimal("0.75") + + def test_bogo_item(self): + raw = { + "description": "BOGO GOOD & GATHER PASTA", + 
"upc": "0085239300456", + "quantity": 2, + "unitPrice": 1.79, + "totalPrice": 1.79, + "regularPrice": 1.79, + "circlePrice": 0.895, + "circleRewardsDiscount": 1.79, + "promoDescription": "Buy 1 get 1 free", + "department": "GROCERY", + } + result = _parse_item(raw) + assert result["quantity"] == Decimal("2") + assert result["unit_price"] == Decimal("1.79") + assert result["extended_price"] == Decimal("1.79") + assert result["sale_price"] == Decimal("0.895") + assert result["loyalty_discount"] == Decimal("1.79") + + def test_multi_quantity_item(self): + raw = { + "description": "MARKET PANTRY EGGS", + "quantity": 2, + "unitPrice": 4.99, + "totalPrice": 9.98, + "department": "GROCERY", + } + result = _parse_item(raw) + assert result["quantity"] == Decimal("2") + assert result["unit_price"] == Decimal("4.99") + assert result["extended_price"] == Decimal("9.98") + + def test_coupon_savings_field(self): + raw = { + "description": "COUPON ITEM", + "couponSavings": 1.00, + "unitPrice": 5.00, + "totalPrice": 5.00, + } + result = _parse_item(raw) + assert result["coupon_discount"] == Decimal("1.00") + + +class TestParseTargetReceipt: + def test_full_receipt(self, target_receipt_data): + raw = RawReceipt( + receipt_id="TGT-2026-0315-7890", + purchase_date="2026-03-15T11:23:00Z", + store_number="2774", + raw_data=target_receipt_data, + ) + result = parse_target_receipt(raw) + + assert result["receipt_id"] == "TGT-2026-0315-7890" + assert result["purchase_date"] == "2026-03-15T11:23:00Z" + assert result["total"] == Decimal("83.21") + assert result["subtotal"] == Decimal("78.32") + assert result["tax"] == Decimal("4.89") + assert result["savings_total"] == Decimal("11.45") + + # Should have 8 items (voided + returned items excluded) + assert len(result["items"]) == 8 + + # Verify first item + milk = result["items"][0] + assert milk["product_name_raw"] == "GOOD & GATHER WHOLE MILK GAL" + assert milk["upc"] == "85239100123" + + def test_voided_items_excluded(self, 
target_receipt_data): + raw = RawReceipt( + receipt_id="TGT-2026-0315-7890", + purchase_date="2026-03-15", + raw_data=target_receipt_data, + ) + result = parse_target_receipt(raw) + + item_names = [i["product_name_raw"] for i in result["items"]] + assert "VOIDED COCA-COLA 12PK" not in item_names + + def test_returned_items_excluded(self, target_receipt_data): + raw = RawReceipt( + receipt_id="TGT-2026-0315-7890", + purchase_date="2026-03-15", + raw_data=target_receipt_data, + ) + result = parse_target_receipt(raw) + + item_names = [i["product_name_raw"] for i in result["items"]] + assert "RETURNED OLAY MOISTURIZER" not in item_names + + def test_return_flag_items_excluded(self): + data = { + "detail": { + "items": [ + { + "description": "NORMAL ITEM", + "unitPrice": 5.00, + "totalPrice": 5.00, + }, + { + "description": "RETURNED VIA FLAG", + "unitPrice": 3.00, + "totalPrice": 3.00, + "returnFlag": True, + }, + { + "description": "IS RETURN ITEM", + "unitPrice": 2.00, + "totalPrice": 2.00, + "isReturn": True, + }, + ], + "total": 5.00, + } + } + raw = RawReceipt( + receipt_id="RET-001", + purchase_date="2026-03-15", + raw_data=data, + ) + result = parse_target_receipt(raw) + assert len(result["items"]) == 1 + assert result["items"][0]["product_name_raw"] == "NORMAL ITEM" + + def test_cancelled_items_excluded(self): + data = { + "detail": { + "items": [ + { + "description": "NORMAL ITEM", + "unitPrice": 5.00, + "totalPrice": 5.00, + }, + { + "description": "CANCELLED ITEM", + "unitPrice": 3.00, + "totalPrice": 3.00, + "status": "CANCELLED", + }, + ], + "total": 5.00, + } + } + raw = RawReceipt( + receipt_id="CAN-001", + purchase_date="2026-03-15", + raw_data=data, + ) + result = parse_target_receipt(raw) + assert len(result["items"]) == 1 + assert result["items"][0]["product_name_raw"] == "NORMAL ITEM" + + def test_empty_receipt(self): + raw = RawReceipt( + receipt_id="EMPTY-001", + purchase_date="2026-03-15", + raw_data={"detail": {"items": [], "total": 0}}, + ) + 
result = parse_target_receipt(raw) + assert result["items"] == [] + assert result["total"] == Decimal("0") + + def test_receipt_with_no_detail(self): + raw = RawReceipt( + receipt_id="NO-DETAIL-001", + purchase_date="2026-03-15", + raw_data={"total": 50.00}, + ) + result = parse_target_receipt(raw) + assert result["items"] == [] + assert result["total"] == Decimal("50.00") + + def test_raw_data_preserved(self, target_receipt_data): + raw = RawReceipt( + receipt_id="TGT-2026-0315-7890", + purchase_date="2026-03-15", + raw_data=target_receipt_data, + ) + result = parse_target_receipt(raw) + assert result["raw_data"] is target_receipt_data + + def test_alternative_total_field_names(self): + raw = RawReceipt( + receipt_id="ALT-001", + purchase_date="2026-03-15", + raw_data={ + "orderTotal": 42.00, + "subTotal": 35.00, + "salesTax": 3.50, + "circleSavings": 5.00, + "detail": {"items": []}, + }, + ) + result = parse_target_receipt(raw) + assert result["total"] == Decimal("42.00") + assert result["subtotal"] == Decimal("35.00") + assert result["tax"] == Decimal("3.50") + assert result["savings_total"] == Decimal("5.00") + + def test_receipt_items_alternative_key(self): + data = { + "detail": { + "lineItems": [ + { + "description": "ALT KEY ITEM", + "unitPrice": 3.00, + "totalPrice": 3.00, + } + ], + "total": 3.00, + } + } + raw = RawReceipt( + receipt_id="ALT-KEY-001", + purchase_date="2026-03-15", + raw_data=data, + ) + result = parse_target_receipt(raw) + assert len(result["items"]) == 1 + assert result["items"][0]["product_name_raw"] == "ALT KEY ITEM" + + def test_source_url_preserved(self): + raw = RawReceipt( + receipt_id="URL-001", + purchase_date="2026-03-15", + raw_data={"detail": {"items": [], "total": 0}}, + source_url="https://api.target.com/order_history/v1/orders/URL-001", + ) + result = parse_target_receipt(raw) + assert result["source_url"] == "https://api.target.com/order_history/v1/orders/URL-001" + + def test_weighted_items_in_full_receipt(self, 
target_receipt_data): + raw = RawReceipt( + receipt_id="TGT-2026-0315-7890", + purchase_date="2026-03-15", + raw_data=target_receipt_data, + ) + result = parse_target_receipt(raw) + + # Find the weighted turkey item + turkey = next(i for i in result["items"] if "TURKEY" in i["product_name_raw"]) + assert turkey["quantity"] == Decimal("0.72") + assert turkey["unit_price"] == Decimal("10.99") + assert turkey["extended_price"] == Decimal("7.91") + + def test_bogo_items_in_full_receipt(self, target_receipt_data): + raw = RawReceipt( + receipt_id="TGT-2026-0315-7890", + purchase_date="2026-03-15", + raw_data=target_receipt_data, + ) + result = parse_target_receipt(raw) + + # Find the BOGO pasta item + pasta = next(i for i in result["items"] if "BOGO" in i["product_name_raw"]) + assert pasta["quantity"] == Decimal("2") + assert pasta["extended_price"] == Decimal("1.79") + assert pasta["loyalty_discount"] == Decimal("1.79") + + def test_grand_total_field(self): + raw = RawReceipt( + receipt_id="GT-001", + purchase_date="2026-03-15", + raw_data={"grandTotal": 99.99, "detail": {"items": []}}, + ) + result = parse_target_receipt(raw) + assert result["total"] == Decimal("99.99") diff --git a/tests/test_pipeline/__init__.py b/tests/test_pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_pipeline/conftest.py b/tests/test_pipeline/conftest.py new file mode 100644 index 0000000..693366f --- /dev/null +++ b/tests/test_pipeline/conftest.py @@ -0,0 +1,23 @@ +"""Shared test fixtures for pipeline tests.""" + +import pytest +from cartsnitch_common.models.base import Base +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + + +@pytest.fixture +def engine(): + """In-memory SQLite engine for unit tests.""" + eng = create_engine("sqlite:///:memory:") + Base.metadata.create_all(eng) + yield eng + eng.dispose() + + +@pytest.fixture +def session(engine): + """SQLAlchemy session bound to in-memory SQLite.""" + factory = 
sessionmaker(bind=engine) + with factory() as sess: + yield sess diff --git a/tests/test_pipeline/test_matching.py b/tests/test_pipeline/test_matching.py new file mode 100644 index 0000000..408153c --- /dev/null +++ b/tests/test_pipeline/test_matching.py @@ -0,0 +1,161 @@ +"""Tests for product matching & dedup pipeline.""" + +import uuid +from datetime import UTC, datetime +from decimal import Decimal + +from cartsnitch_common.constants import MatchConfidence +from cartsnitch_common.models.product import NormalizedProduct +from cartsnitch_common.schemas.purchase import PurchaseItemCreate + +from receiptwitness.pipeline.matching import ( + ProductMatcher, + classify_confidence, + match_purchase_item, +) +from receiptwitness.pipeline.normalization import MatchMethod + + +class TestClassifyConfidence: + def test_upc_always_high(self): + assert classify_confidence(1.0, MatchMethod.UPC) == MatchConfidence.HIGH + assert classify_confidence(0.5, MatchMethod.UPC) == MatchConfidence.HIGH + + def test_name_high(self): + assert classify_confidence(0.9, MatchMethod.NAME) == MatchConfidence.HIGH + assert classify_confidence(0.8, MatchMethod.NAME) == MatchConfidence.HIGH + + def test_name_medium(self): + assert classify_confidence(0.6, MatchMethod.NAME) == MatchConfidence.MEDIUM + assert classify_confidence(0.5, MatchMethod.NAME) == MatchConfidence.MEDIUM + + def test_name_low(self): + assert classify_confidence(0.3, MatchMethod.NAME) == MatchConfidence.LOW + assert classify_confidence(0.0, MatchMethod.NAME) == MatchConfidence.LOW + + +class TestProductMatcher: + def _make_item(self, name: str, upc: str | None = None) -> PurchaseItemCreate: + return PurchaseItemCreate( + product_name_raw=name, + upc=upc, + unit_price=Decimal("3.99"), + extended_price=Decimal("3.99"), + ) + + def test_match_by_upc(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Whole Milk Gallon", + upc_variants=["041250000001"], + created_at=datetime.now(UTC), + 
updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + + matcher = ProductMatcher(session) + item = self._make_item("Kroger Milk", upc="041250000001") + prod, result, confidence = matcher.match_single(item) + + assert prod is not None + assert prod.id == product.id + assert result is not None + assert result.method == MatchMethod.UPC + assert confidence == MatchConfidence.HIGH + + def test_match_by_name(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Whole Milk Gallon", + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + + matcher = ProductMatcher(session, name_threshold=0.3) + item = self._make_item("Whole Milk Gallon Size") + prod, result, confidence = matcher.match_single(item) + + assert prod is not None + assert result is not None + assert result.method == MatchMethod.NAME + + def test_auto_create_when_no_match(self, session): + matcher = ProductMatcher(session, auto_create=True) + item = self._make_item("Unique Product XYZ 16 oz") + prod, result, confidence = matcher.match_single(item) + + assert prod is not None + assert result is None # No match found, was created + assert confidence == MatchConfidence.LOW + assert prod.canonical_name == "Unique Product XYZ 16 oz" + assert prod.size == "16" + assert prod.size_unit == "oz" + + def test_no_create_when_disabled(self, session): + matcher = ProductMatcher(session, auto_create=False) + item = self._make_item("Nonexistent Product") + prod, result, confidence = matcher.match_single(item) + + assert prod is None + assert result is None + + def test_batch_match(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Large Eggs 12 Count", + upc_variants=["012345"], + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + + matcher = ProductMatcher(session) + items = [ + self._make_item("Large Eggs", upc="012345"), + 
self._make_item("Brand New Never Seen Product"), + ] + outcomes = matcher.match_items(items) + + assert len(outcomes) == 2 + assert outcomes[0].match is not None + assert outcomes[0].confidence_level == MatchConfidence.HIGH + assert outcomes[0].created_new is False + assert outcomes[1].match is None + assert outcomes[1].created_new is True + + +class TestMatchPurchaseItem: + def test_convenience_function(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Ground Beef 80/20", + upc_variants=["999888"], + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + + item = PurchaseItemCreate( + product_name_raw="Ground Beef", + upc="999888", + unit_price=Decimal("5.99"), + extended_price=Decimal("5.99"), + ) + prod, confidence = match_purchase_item(session, item) + assert prod is not None + assert confidence == MatchConfidence.HIGH + + def test_auto_create_default(self, session): + item = PurchaseItemCreate( + product_name_raw="Totally New Item", + unit_price=Decimal("1.00"), + extended_price=Decimal("1.00"), + ) + prod, confidence = match_purchase_item(session, item) + assert prod is not None + assert confidence == MatchConfidence.LOW diff --git a/tests/test_pipeline/test_normalization.py b/tests/test_pipeline/test_normalization.py new file mode 100644 index 0000000..de1d566 --- /dev/null +++ b/tests/test_pipeline/test_normalization.py @@ -0,0 +1,158 @@ +"""Tests for product normalization module.""" + +import uuid +from datetime import UTC, datetime + +from cartsnitch_common.models.product import NormalizedProduct + +from receiptwitness.pipeline.normalization import ( + MatchMethod, + clean_name, + extract_size_info, + jaccard_similarity, + match_by_name, + match_by_upc, + normalize_product, +) + + +class TestCleanName: + def test_lowercase(self): + assert clean_name("Kroger WHOLE MILK") == "kroger whole milk" + + def test_removes_size_info(self): + assert "oz" not in clean_name("Milk 
16 oz Whole") + + def test_removes_noise_words(self): + cleaned = clean_name("The Original Brand Milk") + assert "the" not in cleaned.split() + assert "original" not in cleaned.split() + assert "brand" not in cleaned.split() + + def test_collapses_whitespace(self): + assert " " not in clean_name("Milk Whole Gallon") + + def test_removes_punctuation(self): + cleaned = clean_name("Meijer's Best (Organic) Milk!") + assert "'" not in cleaned + assert "(" not in cleaned + + +class TestExtractSizeInfo: + def test_extracts_oz(self): + result = extract_size_info("Cereal 18 oz box") + assert result == ("18", "oz") + + def test_extracts_fl_oz(self): + result = extract_size_info("Juice 64 fl oz") + assert result == ("64", "fl_oz") + + def test_extracts_lb(self): + result = extract_size_info("Ground Beef 1.5 lb") + assert result == ("1.5", "lb") + + def test_extracts_ct(self): + result = extract_size_info("Eggs Large 12 ct") + assert result == ("12", "ct") + + def test_no_size_returns_none(self): + assert extract_size_info("Bananas") is None + + +class TestJaccardSimilarity: + def test_identical_strings(self): + assert jaccard_similarity("whole milk gallon", "whole milk gallon") == 1.0 + + def test_completely_different(self): + assert jaccard_similarity("apple juice", "ground beef") == 0.0 + + def test_partial_overlap(self): + score = jaccard_similarity("kroger whole milk", "meijer whole milk") + assert 0.4 < score < 0.8 # "whole" and "milk" overlap + + def test_empty_strings(self): + assert jaccard_similarity("", "") == 0.0 + assert jaccard_similarity("milk", "") == 0.0 + + +class TestMatchByUPC: + def test_match_found(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Whole Milk, Gallon", + upc_variants=["0041250000001", "0041250000002"], + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + # SQLite doesn't support JSONB containment — this will raise + # In production (PostgreSQL), 
this would work + result = match_by_upc(session, "0041250000001") + assert result is not None + assert result.method == MatchMethod.UPC + assert result.confidence == 1.0 + + def test_no_match(self, session): + result = match_by_upc(session, "9999999999999") + assert result is None + + +class TestMatchByName: + def test_exact_name_match(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Whole Milk, Gallon", + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + result = match_by_name(session, "Whole Milk Gallon") + assert result is not None + assert result.method == MatchMethod.NAME + assert result.confidence > 0.5 + + def test_fuzzy_match(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Kroger Whole Milk, 1 Gallon", + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + result = match_by_name(session, "Meijer Whole Milk 1 Gallon", threshold=0.3) + assert result is not None + assert result.confidence > 0.3 + + def test_no_match_below_threshold(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Ground Beef 80/20", + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + result = match_by_name(session, "Apple Juice 64 oz", threshold=0.5) + assert result is None + + +class TestNormalizeProduct: + def test_name_fallback(self, session): + product = NormalizedProduct( + id=uuid.uuid4(), + canonical_name="Large Eggs, 12 count", + created_at=datetime.now(UTC), + updated_at=datetime.now(UTC), + ) + session.add(product) + session.commit() + result = normalize_product(session, "Large Eggs 12 ct", upc=None) + assert result is not None + assert result.method == MatchMethod.NAME + + def test_no_match(self, session): + result = normalize_product(session, "Nonexistent Product XYZ", upc=None) + assert result is None 
diff --git a/tests/test_pipeline/test_receipt.py b/tests/test_pipeline/test_receipt.py new file mode 100644 index 0000000..8210713 --- /dev/null +++ b/tests/test_pipeline/test_receipt.py @@ -0,0 +1,204 @@ +"""Tests for receipt normalization pipeline.""" + +import uuid +from datetime import date +from decimal import Decimal + +from receiptwitness.pipeline.receipt import ( + _clean_product_name, + _safe_decimal, + normalize_receipt, + parse_meijer_item, +) + + +class TestCleanProductName: + def test_strips_whitespace(self): + assert _clean_product_name(" Milk ") == "Milk" + + def test_removes_leading_punctuation(self): + assert _clean_product_name("---Milk---") == "Milk" + + def test_collapses_internal_whitespace(self): + assert _clean_product_name("Whole Milk Gallon") == "Whole Milk Gallon" + + def test_empty_string(self): + assert _clean_product_name("") == "" + + +class TestSafeDecimal: + def test_string_input(self): + assert _safe_decimal("3.99") == Decimal("3.99") + + def test_float_input(self): + assert _safe_decimal(3.99) == Decimal("3.99") + + def test_int_input(self): + assert _safe_decimal(4) == Decimal("4") + + def test_none_returns_default(self): + assert _safe_decimal(None) == Decimal("0") + + def test_none_custom_default(self): + assert _safe_decimal(None, Decimal("1")) == Decimal("1") + + def test_invalid_returns_default(self): + assert _safe_decimal("not-a-number") == Decimal("0") + + def test_decimal_passthrough(self): + assert _safe_decimal(Decimal("5.50")) == Decimal("5.50") + + +class TestParseMeijerItem: + def test_basic_item(self): + raw = { + "description": "Kroger Whole Milk 1 Gallon", + "upc": "0041250000001", + "quantity": 1, + "unitPrice": "3.99", + "extendedPrice": "3.99", + "category": "DAIRY", + } + item = parse_meijer_item(raw) + assert item.product_name_raw == "Kroger Whole Milk 1 Gallon" + assert item.upc == "41250000001" # leading zeros stripped + assert item.quantity == Decimal("1") + assert item.unit_price == Decimal("3.99") + 
assert item.extended_price == Decimal("3.99") + assert item.category_raw == "DAIRY" + + def test_alternate_field_names(self): + raw = { + "name": "Eggs Large 12 ct", + "upcCode": "012345", + "qty": 2, + "price": "4.50", + "totalPrice": "9.00", + "department": "EGGS", + } + item = parse_meijer_item(raw) + assert item.product_name_raw == "Eggs Large 12 ct" + assert item.upc == "12345" + assert item.quantity == Decimal("2") + assert item.unit_price == Decimal("4.50") + assert item.extended_price == Decimal("9.00") + assert item.category_raw == "EGGS" + + def test_calculates_extended_from_unit_price(self): + raw = { + "description": "Bananas", + "unitPrice": "0.59", + "quantity": 3, + } + item = parse_meijer_item(raw) + assert item.extended_price == Decimal("1.77") + + def test_discounts_parsed(self): + raw = { + "description": "Cereal", + "unitPrice": "4.99", + "extendedPrice": "4.99", + "regularPrice": "5.99", + "salePrice": "4.99", + "couponAmount": "1.00", + "loyaltyAmount": "0.50", + } + item = parse_meijer_item(raw) + assert item.regular_price == Decimal("5.99") + assert item.sale_price == Decimal("4.99") + assert item.coupon_discount == Decimal("1.00") + assert item.loyalty_discount == Decimal("0.50") + + def test_alternate_discount_names(self): + raw = { + "description": "Bread", + "unitPrice": "2.99", + "extendedPrice": "2.99", + "couponDiscount": "0.75", + "loyaltyDiscount": "0.25", + } + item = parse_meijer_item(raw) + assert item.coupon_discount == Decimal("0.75") + assert item.loyalty_discount == Decimal("0.25") + + def test_missing_fields_default_gracefully(self): + raw = {"description": "Mystery Item"} + item = parse_meijer_item(raw) + assert item.product_name_raw == "Mystery Item" + assert item.upc is None + assert item.quantity == Decimal("1") + assert item.unit_price == Decimal("0") + assert item.regular_price is None + assert item.category_raw is None + + def test_no_upc_returns_none(self): + raw = {"description": "Loose Bananas", "unitPrice": 
"1.00", "extendedPrice": "1.00"} + item = parse_meijer_item(raw) + assert item.upc is None + + +class TestNormalizeReceipt: + def test_full_receipt(self): + user_id = str(uuid.uuid4()) + store_id = str(uuid.uuid4()) + raw = { + "receiptId": "REC-001", + "date": "2026-03-15", + "total": "25.47", + "subtotal": "23.00", + "tax": "2.47", + "savings": "3.00", + "items": [ + {"description": "Milk", "unitPrice": "3.99", "extendedPrice": "3.99"}, + {"description": "Bread", "unitPrice": "2.50", "extendedPrice": "2.50"}, + ], + } + purchase = normalize_receipt(raw, user_id, store_id) + assert purchase.receipt_id == "REC-001" + assert purchase.purchase_date == date(2026, 3, 15) + assert purchase.total == Decimal("25.47") + assert purchase.subtotal == Decimal("23.00") + assert purchase.tax == Decimal("2.47") + assert purchase.savings_total == Decimal("3.00") + assert len(purchase.items) == 2 + assert purchase.items[0].product_name_raw == "Milk" + assert purchase.raw_data == raw + + def test_alternate_receipt_fields(self): + user_id = str(uuid.uuid4()) + store_id = str(uuid.uuid4()) + raw = { + "receipt_id": "REC-002", + "purchaseDate": "2026-03-14", + "totalAmount": "10.00", + "taxAmount": "0.75", + "totalSavings": "1.50", + "items": [], + } + purchase = normalize_receipt(raw, user_id, store_id) + assert purchase.receipt_id == "REC-002" + assert purchase.purchase_date == date(2026, 3, 14) + assert purchase.total == Decimal("10.00") + assert purchase.tax == Decimal("0.75") + assert purchase.savings_total == Decimal("1.50") + + def test_missing_date_defaults_to_today(self): + user_id = str(uuid.uuid4()) + store_id = str(uuid.uuid4()) + raw = {"total": "5.00", "items": []} + purchase = normalize_receipt(raw, user_id, store_id) + assert purchase.purchase_date == date.today() + + def test_generates_receipt_id_if_missing(self): + user_id = str(uuid.uuid4()) + store_id = str(uuid.uuid4()) + raw = {"total": "5.00", "date": "2026-03-15", "items": []} + purchase = normalize_receipt(raw, 
user_id, store_id) + assert purchase.receipt_id # Should be a generated UUID string + + def test_date_object_passthrough(self): + user_id = str(uuid.uuid4()) + store_id = str(uuid.uuid4()) + raw = {"date": date(2026, 1, 1), "total": "5.00", "items": []} + purchase = normalize_receipt(raw, user_id, store_id) + assert purchase.purchase_date == date(2026, 1, 1) diff --git a/tests/test_regression/__init__.py b/tests/test_regression/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_regression/test_layout_changes.py b/tests/test_regression/test_layout_changes.py new file mode 100644 index 0000000..7843c43 --- /dev/null +++ b/tests/test_regression/test_layout_changes.py @@ -0,0 +1,435 @@ +"""Regression tests: graceful handling of page layout changes. + +Retailers frequently change their API response structures, field names, +and nesting. These tests verify that both parsers degrade gracefully when +encountering alternative or missing fields — producing valid output +instead of crashing. 
class TestKrogerFieldNameVariations:
    """Kroger changes field names between app versions and API revisions.

    Each test feeds ``parse_kroger_receipt`` a minimal payload that uses one
    alternative field name and checks the normalized output.
    """

    @staticmethod
    def _parse(receipt_id, raw_data):
        """Wrap ``raw_data`` in a ``RawReceipt`` and return the parsed receipt dict."""
        raw = RawReceipt(
            receipt_id=receipt_id,
            purchase_date="2026-03-12",
            raw_data=raw_data,
        )
        return parse_kroger_receipt(raw)

    @classmethod
    def _parse_items(cls, receipt_id, items, total, items_key="items"):
        """Parse a receipt whose ``detail`` holds ``items`` under ``items_key``."""
        return cls._parse(receipt_id, {"detail": {items_key: items, "total": total}})

    def test_alternative_item_key_line_items(self):
        """Items listed under ``lineItems`` are expected to be parsed."""
        result = self._parse_items(
            "KR-ALT-1",
            [{"description": "MILK", "basePrice": 3.99, "totalPrice": 3.99}],
            3.99,
            items_key="lineItems",
        )
        assert len(result["items"]) == 1
        assert result["items"][0]["product_name_raw"] == "MILK"

    def test_alternative_item_key_receipt_items(self):
        """Items listed under ``receiptItems`` are expected to be parsed."""
        result = self._parse_items(
            "KR-ALT-2",
            [{"description": "EGGS", "basePrice": 5.49, "totalPrice": 5.49}],
            5.49,
            items_key="receiptItems",
        )
        assert len(result["items"]) == 1
        assert result["items"][0]["product_name_raw"] == "EGGS"

    def test_alternative_description_fields(self):
        """Test productName, itemDescription and name fallbacks for the description."""
        for field in ("productName", "itemDescription", "name"):
            result = self._parse_items(
                "KR-DESC",
                [{field: "TEST PRODUCT", "basePrice": 1.00, "totalPrice": 1.00}],
                1.00,
            )
            # Name the field in the failure message; the loop would otherwise
            # hide which variant broke.
            assert result["items"][0]["product_name_raw"] == "TEST PRODUCT", field

    def test_alternative_price_fields(self):
        """Test unitPrice and price fallbacks for basePrice."""
        result = self._parse_items(
            "KR-PRICE-1",
            [{"description": "ITEM A", "unitPrice": 2.50, "totalPrice": 2.50}],
            2.50,
        )
        assert result["items"][0]["unit_price"] == Decimal("2.50")

        result2 = self._parse_items(
            "KR-PRICE-2",
            [{"description": "ITEM B", "price": 4.00, "totalPrice": 4.00}],
            4.00,
        )
        assert result2["items"][0]["unit_price"] == Decimal("4.00")

    def test_alternative_total_fields(self):
        """Test orderTotal, grandTotal fallbacks."""
        for field in ("orderTotal", "grandTotal"):
            result = self._parse("KR-TOT", {field: 42.50, "detail": {}})
            assert result["total"] == Decimal("42.50"), field

    def test_alternative_savings_fields(self):
        """Test the youSaved fallback for savings_total.

        NOTE(review): a ``totalDiscount`` fallback is not exercised here; add a
        case once parser support for that field is confirmed.
        """
        result = self._parse("KR-SAV-1", {"youSaved": 5.00, "detail": {}})
        assert result["savings_total"] == Decimal("5.00")

    def test_alternative_tax_field(self):
        """Test the salesTax fallback for tax."""
        result = self._parse("KR-TAX", {"salesTax": 3.25, "detail": {}})
        assert result["tax"] == Decimal("3.25")

    def test_alternative_quantity_field_qty(self):
        """Test the qty fallback for quantity."""
        result = self._parse_items(
            "KR-QTY",
            [{"description": "APPLES", "qty": 5, "basePrice": 1.00, "totalPrice": 5.00}],
            5.00,
        )
        assert result["items"][0]["quantity"] == Decimal("5")

    def test_alternative_upc_field_kroger_product_id(self):
        """Test the krogerProductId fallback for upc."""
        result = self._parse_items(
            "KR-UPC",
            [
                {
                    "description": "ITEM",
                    "krogerProductId": "12345678",
                    "basePrice": 1.00,
                    "totalPrice": 1.00,
                }
            ],
            1.00,
        )
        assert result["items"][0]["upc"] == "12345678"

    def test_missing_extended_price_computed(self):
        """When totalPrice is missing, extended_price = unit_price * quantity."""
        result = self._parse_items(
            "KR-CALC",
            [{"description": "EGGS", "basePrice": 5.49, "quantity": 2}],
            10.98,
        )
        assert result["items"][0]["extended_price"] == Decimal("5.49") * Decimal("2")
4.99}], + "total": 4.99, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["extended_price"] == Decimal("4.99") + + def test_alternative_total_field_transaction_total(self): + raw = RawReceipt( + receipt_id="MJ-TOT", + purchase_date="2026-03-10", + raw_data={"transactionTotal": 55.00, "detail": {}}, + ) + result = parse_meijer_receipt(raw) + assert result["total"] == Decimal("55.00") + + def test_alternative_loyalty_field(self): + raw = RawReceipt( + receipt_id="MJ-LOY", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [ + { + "description": "ITEM", + "price": 5.00, + "extendedPrice": 5.00, + "loyaltyDiscount": 0.50, + } + ], + "total": 5.00, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["loyalty_discount"] == Decimal("0.50") + + def test_alternative_upc_field_uppercase(self): + raw = RawReceipt( + receipt_id="MJ-UPC", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [ + { + "description": "ITEM", + "UPC": "0012345678", + "price": 1.00, + "extendedPrice": 1.00, + } + ], + "total": 1.00, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["upc"] == "12345678" + + def test_alternative_category_field(self): + raw = RawReceipt( + receipt_id="MJ-CAT", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [ + { + "description": "ITEM", + "price": 1.00, + "extendedPrice": 1.00, + "departmentDescription": "FROZEN", + } + ], + "total": 1.00, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["category_raw"] == "FROZEN" + + def test_missing_extended_price_computed(self): + raw = RawReceipt( + receipt_id="MJ-CALC", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [{"description": "MILK", "price": 3.49, "quantity": 2}], + "total": 6.98, + } + }, + ) + result = parse_meijer_receipt(raw) + assert result["items"][0]["extended_price"] == Decimal("3.49") * Decimal("2") + + def 
class TestMixedFieldVersions:
    """Receipts that mix field naming conventions (seen during rollouts)."""

    def test_kroger_mixed_item_fields(self):
        """One receipt carries an old-style item next to a new-style item."""
        legacy_item = {"description": "OLD STYLE", "basePrice": 2.00, "totalPrice": 2.00}
        renamed_item = {"productName": "NEW STYLE", "unitPrice": 3.00, "extendedAmount": 3.00}
        raw = RawReceipt(
            receipt_id="KR-MIX",
            purchase_date="2026-03-12",
            raw_data={"detail": {"items": [legacy_item, renamed_item], "total": 5.00}},
        )

        parsed_items = parse_kroger_receipt(raw)["items"]

        assert len(parsed_items) == 2
        expected = (("OLD STYLE", Decimal("2.00")), ("NEW STYLE", Decimal("3.00")))
        for position, (name, unit_price) in enumerate(expected):
            assert parsed_items[position]["product_name_raw"] == name
            assert parsed_items[position]["unit_price"] == unit_price

    def test_kroger_completely_unknown_structure_no_crash(self):
        """An unrecognized Kroger payload parses to a receipt with no items."""
        payload = {"something_unexpected": [1, 2, 3], "detail": {"foo": "bar"}}
        raw = RawReceipt(receipt_id="KR-UNKNOWN", purchase_date="2026-03-12", raw_data=payload)

        parsed = parse_kroger_receipt(raw)

        assert parsed["receipt_id"] == "KR-UNKNOWN"
        assert parsed["items"] == []

    def test_meijer_completely_unknown_structure_no_crash(self):
        """An unrecognized Meijer payload parses to a receipt with no items."""
        payload = {"something_unexpected": [1, 2, 3], "detail": {"foo": "bar"}}
        raw = RawReceipt(receipt_id="MJ-UNKNOWN", purchase_date="2026-03-10", raw_data=payload)

        parsed = parse_meijer_receipt(raw)

        assert parsed["receipt_id"] == "MJ-UNKNOWN"
        assert parsed["items"] == []

    def test_kroger_null_fields_no_crash(self):
        """None-valued Kroger fields are tolerated; the unit price falls back to 0."""
        null_item = {
            "description": "ITEM",
            **dict.fromkeys(("basePrice", "totalPrice", "quantity", "upc", "department")),
        }
        detail = {"items": [null_item], **dict.fromkeys(("total", "subtotal", "tax"))}
        raw = RawReceipt(
            receipt_id="KR-NULL",
            purchase_date="2026-03-12",
            raw_data={"detail": detail},
        )

        first_item = parse_kroger_receipt(raw)["items"][0]

        assert first_item["product_name_raw"] == "ITEM"
        assert first_item["unit_price"] == Decimal("0")

    def test_meijer_null_fields_no_crash(self):
        """None-valued Meijer fields are tolerated; the unit price falls back to 0."""
        null_item = {
            "description": "ITEM",
            **dict.fromkeys(("price", "extendedPrice", "quantity", "upc", "category")),
        }
        raw = RawReceipt(
            receipt_id="MJ-NULL",
            purchase_date="2026-03-10",
            raw_data={"detail": {"items": [null_item], "total": None}},
        )

        first_item = parse_meijer_receipt(raw)["items"][0]

        assert first_item["product_name_raw"] == "ITEM"
        assert first_item["unit_price"] == Decimal("0")
class TestHumanDelayBehavior:
    """Check the sleep duration that human_delay hands to asyncio.sleep."""

    # Patch target: the sleep call as reached through the scraper base module.
    _SLEEP_TARGET = "receiptwitness.scrapers.base.asyncio.sleep"

    @pytest.mark.asyncio
    async def test_delay_within_bounds(self):
        """Explicit 100-200 ms bounds must yield a sleep of 0.1-0.2 seconds."""
        scraper = KrogerScraper()
        with patch(self._SLEEP_TARGET, new_callable=AsyncMock) as mock_sleep:
            await scraper.human_delay(100, 200)
            mock_sleep.assert_called_once()
            slept_for = mock_sleep.call_args.args[0]
            assert 0.1 <= slept_for <= 0.2

    @pytest.mark.asyncio
    async def test_delay_uses_settings_defaults(self):
        """With no arguments the bounds come from settings.min/max_request_delay_ms."""
        scraper = MeijerScraper()
        with (
            patch(
                "receiptwitness.scrapers.base.settings",
                min_request_delay_ms=1000,
                max_request_delay_ms=5000,
            ),
            patch(self._SLEEP_TARGET, new_callable=AsyncMock) as mock_sleep,
        ):
            await scraper.human_delay()
            mock_sleep.assert_called_once()
            slept_for = mock_sleep.call_args.args[0]
            assert 1.0 <= slept_for <= 5.0

    @pytest.mark.asyncio
    async def test_delay_is_randomized(self):
        """Repeated calls should not all sleep for the same duration (probabilistic)."""
        scraper = KrogerScraper()
        with patch(self._SLEEP_TARGET, new_callable=AsyncMock) as mock_sleep:
            for _ in range(20):
                await scraper.human_delay(100, 5000)
            observed = [recorded.args[0] for recorded in mock_sleep.call_args_list]
        # With range 100-5000ms, 20 calls should have at least 2 distinct values
        assert len(set(observed)) >= 2
"""Verify Kroger scraper calls human_delay between receipt fetches.""" + + @pytest.mark.asyncio + async def test_delay_called_between_receipts(self): + """Scraper must call human_delay for each receipt detail fetch.""" + scraper = KrogerScraper() + valid_session = SessionData( + cookies=[{"name": "s", "value": "v", "domain": ".kroger.com", "path": "/"}], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=datetime.now(UTC) + timedelta(hours=2), + ) + + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "orders": [ + { + "orderId": f"KR-{i}", + "purchaseDate": "2026-03-10T14:00:00Z", + "storeNumber": "357", + } + for i in range(3) + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response] + [mock_detail_response] * 3) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock) as mock_delay, + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + + assert len(receipts) == 3 + # human_delay called at least once per receipt (after initial page nav) + # Plus 
class TestMeijerRateLimiting:
    """Verify Meijer scraper calls human_delay between receipt fetches."""

    @pytest.mark.asyncio
    async def test_delay_called_between_receipts(self):
        """Three listed transactions must produce at least three human_delay calls."""
        scraper = MeijerScraper()
        session_created = datetime.now(UTC)
        session_expires = datetime.now(UTC) + timedelta(hours=4)
        valid_session = SessionData(
            cookies=[dict(name="s", value="v", domain=".meijer.com", path="/")],
            user_agent="test",
            created_at=session_created,
            expires_at=session_expires,
        )

        # Responses: one transaction listing, then an empty detail per transaction.
        transactions = [
            {
                "transactionId": f"TXN-{index}",
                "transactionDate": "2026-03-10T14:00:00Z",
                "storeNumber": "42",
            }
            for index in range(3)
        ]
        listing = AsyncMock(ok=True, json=AsyncMock(return_value={"transactions": transactions}))
        detail = AsyncMock(ok=True, json=AsyncMock(return_value={}))
        request = AsyncMock(get=AsyncMock(side_effect=[listing, detail, detail, detail]))

        # Fake Playwright object graph: playwright -> browser -> context -> page.
        page = AsyncMock(goto=AsyncMock(), request=request)
        context = AsyncMock(
            new_page=AsyncMock(return_value=page),
            add_cookies=AsyncMock(),
            add_init_script=AsyncMock(),
        )
        browser = AsyncMock(new_context=AsyncMock(return_value=context))
        context.browser = browser
        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        # async_playwright() is used as an async context manager.
        manager = AsyncMock()
        manager.__aenter__ = AsyncMock(return_value=playwright)
        manager.__aexit__ = AsyncMock(return_value=False)

        with (
            patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock) as mock_delay,
        ):
            mock_apw.return_value = manager
            receipts = await scraper.scrape_receipts(valid_session)

        assert len(receipts) == 3
        assert mock_delay.call_count >= 3
class TestGracefulErrorRecovery:
    """Scrapers should not retry endlessly on errors."""

    @staticmethod
    def _session(domain, user_agent, hours):
        """Build an unexpired ``SessionData`` carrying one cookie for ``domain``."""
        return SessionData(
            cookies=[{"name": "s", "value": "v", "domain": domain, "path": "/"}],
            user_agent=user_agent,
            created_at=datetime.now(UTC),
            expires_at=datetime.now(UTC) + timedelta(hours=hours),
        )

    @staticmethod
    def _playwright_manager(mock_request):
        """Return a fake ``async_playwright()`` context manager.

        The stubbed chain is: manager -> playwright -> ``chromium.launch()`` ->
        browser -> ``new_context()`` -> context -> ``new_page()`` -> page, and
        the page's ``request`` attribute is ``mock_request``.
        """
        mock_page = AsyncMock()
        mock_page.goto = AsyncMock()
        mock_page.request = mock_request

        mock_context = AsyncMock()
        mock_context.new_page = AsyncMock(return_value=mock_page)
        mock_context.add_cookies = AsyncMock()
        mock_context.add_init_script = AsyncMock()
        mock_browser = AsyncMock()
        mock_browser.new_context = AsyncMock(return_value=mock_context)
        mock_context.browser = mock_browser

        mock_pw = AsyncMock()
        mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)

        mock_cm = AsyncMock()
        mock_cm.__aenter__ = AsyncMock(return_value=mock_pw)
        mock_cm.__aexit__ = AsyncMock(return_value=False)
        return mock_cm

    async def _scrape(self, scraper, playwright_target, session, mock_request):
        """Run ``scraper.scrape_receipts`` with Playwright and ``human_delay`` mocked."""
        with (
            patch(playwright_target) as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            mock_apw.return_value = self._playwright_manager(mock_request)
            return await scraper.scrape_receipts(session)

    async def _assert_kroger_listing_error_not_retried(self, status, status_text):
        """Assert a non-OK Kroger listing response gives ``[]`` after a single request."""
        scraper = KrogerScraper()
        valid_session = self._session(".kroger.com", DEFAULT_USER_AGENT, 2)

        mock_api_response = AsyncMock()
        mock_api_response.ok = False
        mock_api_response.status = status
        mock_api_response.status_text = status_text

        mock_request = AsyncMock()
        mock_request.get = AsyncMock(return_value=mock_api_response)

        receipts = await self._scrape(
            scraper,
            "receiptwitness.scrapers.kroger.async_playwright",
            valid_session,
            mock_request,
        )

        assert receipts == []
        # Should only call the API once — no retries
        assert mock_request.get.call_count == 1

    @pytest.mark.asyncio
    async def test_kroger_api_500_returns_empty_not_retry(self):
        """500 error should return empty list, not retry."""
        await self._assert_kroger_listing_error_not_retried(500, "Internal Server Error")

    @pytest.mark.asyncio
    async def test_kroger_429_returns_empty_not_retry(self):
        """Rate limit (429) should return empty, not retry."""
        await self._assert_kroger_listing_error_not_retried(429, "Too Many Requests")

    @pytest.mark.asyncio
    async def test_meijer_detail_exception_continues(self):
        """A failed receipt-detail fetch should not abort the remaining receipts.

        The failure is simulated as a non-OK (HTTP 500) detail response; no
        exception is actually raised by the mock.
        """
        scraper = MeijerScraper()
        valid_session = self._session(".meijer.com", "test", 4)

        mock_api_response = AsyncMock()
        mock_api_response.ok = True
        mock_api_response.json = AsyncMock(
            return_value={
                "transactions": [
                    {
                        "transactionId": "TXN-1",
                        "transactionDate": "2026-03-10T14:00:00Z",
                        "storeNumber": "42",
                    },
                    {
                        "transactionId": "TXN-2",
                        "transactionDate": "2026-03-11T10:00:00Z",
                        "storeNumber": "42",
                    },
                ]
            }
        )

        # First detail call comes back non-OK (500), second succeeds
        mock_detail_fail = AsyncMock()
        mock_detail_fail.ok = False
        mock_detail_fail.status = 500

        mock_detail_ok = AsyncMock()
        mock_detail_ok.ok = True
        mock_detail_ok.json = AsyncMock(return_value={"items": []})

        mock_request = AsyncMock()
        mock_request.get = AsyncMock(
            side_effect=[mock_api_response, mock_detail_fail, mock_detail_ok]
        )

        receipts = await self._scrape(
            scraper,
            "receiptwitness.scrapers.meijer.async_playwright",
            valid_session,
            mock_request,
        )

        # Both receipts should be returned — the first with empty detail
        assert len(receipts) == 2
        assert receipts[0].raw_data.get("detail") == {}
        assert receipts[1].receipt_id == "TXN-2"
Uses recorded fixtures to ensure outputs +remain stable across code changes. +""" + +from decimal import Decimal + +from receiptwitness.parsers.kroger import parse_kroger_receipt +from receiptwitness.parsers.meijer import parse_meijer_receipt +from receiptwitness.scrapers.base import RawReceipt + +# Required top-level keys in a parsed receipt +RECEIPT_REQUIRED_KEYS = {"receipt_id", "purchase_date", "total", "items", "raw_data"} +RECEIPT_OPTIONAL_KEYS = {"subtotal", "tax", "savings_total", "source_url"} + +# Required keys in each parsed item +ITEM_REQUIRED_KEYS = { + "product_name_raw", + "upc", + "quantity", + "unit_price", + "extended_price", +} +ITEM_OPTIONAL_KEYS = { + "regular_price", + "sale_price", + "coupon_discount", + "loyalty_discount", + "category_raw", +} + + +def _validate_receipt_schema(result: dict) -> None: + """Assert that a parsed receipt dict conforms to the expected schema.""" + # All required keys present + for key in RECEIPT_REQUIRED_KEYS: + assert key in result, f"Missing required key: {key}" + + # Types + assert isinstance(result["receipt_id"], str) + assert isinstance(result["purchase_date"], str) + assert isinstance(result["total"], Decimal) + assert isinstance(result["items"], list) + assert isinstance(result["raw_data"], dict) + + # Optional keys should be correct types when present + if result.get("subtotal") is not None: + assert isinstance(result["subtotal"], Decimal) + if result.get("tax") is not None: + assert isinstance(result["tax"], Decimal) + if result.get("savings_total") is not None: + assert isinstance(result["savings_total"], Decimal) + if result.get("source_url") is not None: + assert isinstance(result["source_url"], str) + + # No unexpected keys + all_keys = RECEIPT_REQUIRED_KEYS | RECEIPT_OPTIONAL_KEYS + for key in result: + assert key in all_keys, f"Unexpected key in receipt: {key}" + + +def _validate_item_schema(item: dict) -> None: + """Assert that a parsed item dict conforms to the expected schema.""" + for key in 
ITEM_REQUIRED_KEYS: + assert key in item, f"Missing required item key: {key}" + + assert isinstance(item["product_name_raw"], str) + assert len(item["product_name_raw"]) > 0 + assert isinstance(item["quantity"], Decimal) + assert isinstance(item["unit_price"], Decimal) + assert isinstance(item["extended_price"], Decimal) + + # UPC can be None or str + if item["upc"] is not None: + assert isinstance(item["upc"], str) + # UPC should not have leading zeros (stripped during parsing) + assert not item["upc"].startswith("0"), f"UPC has leading zeros: {item['upc']}" + + # Optional Decimal fields + for opt_key in ("regular_price", "sale_price", "coupon_discount", "loyalty_discount"): + if item.get(opt_key) is not None: + assert isinstance(item[opt_key], Decimal), f"{opt_key} should be Decimal" + + if item.get("category_raw") is not None: + assert isinstance(item["category_raw"], str) + + # No unexpected keys + all_keys = ITEM_REQUIRED_KEYS | ITEM_OPTIONAL_KEYS + for key in item: + assert key in all_keys, f"Unexpected key in item: {key}" + + +class TestKrogerSchemaValidation: + def test_full_receipt_schema(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12T16:45:00Z", + store_number="00357", + raw_data=kroger_receipt_data, + source_url="https://www.kroger.com/atlas/v1/receipt/api?orderId=KR-2026-0312-4471", + ) + result = parse_kroger_receipt(raw) + _validate_receipt_schema(result) + for item in result["items"]: + _validate_item_schema(item) + + def test_item_count_excludes_voided_and_returned(self, kroger_receipt_data): + """Fixture has 10 items, 2 should be excluded (voided + returned).""" + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12T16:45:00Z", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + assert len(result["items"]) == 8 + + def test_totals_are_positive_decimals(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + 
purchase_date="2026-03-12T16:45:00Z", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + assert result["total"] > Decimal("0") + assert result["subtotal"] > Decimal("0") + assert result["tax"] > Decimal("0") + assert result["savings_total"] > Decimal("0") + + def test_receipt_id_preserved(self, kroger_receipt_data): + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12T16:45:00Z", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + assert result["receipt_id"] == "KR-2026-0312-4471" + + def test_known_product_prices(self, kroger_receipt_data): + """Verify specific products produce correct price extraction.""" + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12T16:45:00Z", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + items_by_name = {i["product_name_raw"]: i for i in result["items"]} + + # Milk: $3.99, regular $4.29 + milk = items_by_name["KROGER WHOLE MILK GAL"] + assert milk["unit_price"] == Decimal("3.99") + assert milk["regular_price"] == Decimal("4.29") + assert milk["sale_price"] == Decimal("3.99") + + # Eggs: qty 2, $5.49 each, total $10.98 + eggs = items_by_name["SIMPLE TRUTH ORG EGGS 12CT"] + assert eggs["quantity"] == Decimal("2") + assert eggs["unit_price"] == Decimal("5.49") + assert eggs["extended_price"] == Decimal("10.98") + + # Deli turkey: weighted item, 0.68 lb + turkey = items_by_name["KROGER DELI TURKEY BREAST"] + assert turkey["quantity"] == Decimal("0.68") + assert turkey["upc"] is None + + def test_multi_quantity_item_correct(self, kroger_receipt_data): + """Pasta is qty=3, unit=$2.49, total=$7.47.""" + raw = RawReceipt( + receipt_id="KR-2026-0312-4471", + purchase_date="2026-03-12T16:45:00Z", + raw_data=kroger_receipt_data, + ) + result = parse_kroger_receipt(raw) + pasta = [i for i in result["items"] if "PASTA" in i["product_name_raw"]][0] + assert pasta["quantity"] == Decimal("3") + assert pasta["unit_price"] 
class TestMeijerSchemaValidation:
    """Schema and value regression checks against the recorded Meijer fixture."""

    @staticmethod
    def _parse(meijer_receipt_data):
        """Parse the fixture payload wrapped in the standard test ``RawReceipt``."""
        raw = RawReceipt(
            receipt_id="TXN-2026-0310-001",
            purchase_date="2026-03-10T14:30:00Z",
            raw_data=meijer_receipt_data,
        )
        return parse_meijer_receipt(raw)

    @staticmethod
    def _item_containing(result, fragment):
        """Return the first parsed item whose raw name contains ``fragment``."""
        return next(i for i in result["items"] if fragment in i["product_name_raw"])

    def test_full_receipt_schema(self, meijer_receipt_data):
        """The parsed receipt and every item conform to the schema validators."""
        raw = RawReceipt(
            receipt_id="TXN-2026-0310-001",
            purchase_date="2026-03-10T14:30:00Z",
            store_number="42",
            raw_data=meijer_receipt_data,
            source_url="https://www.meijer.com/bin/meijer/profile/receipt?receiptId=TXN-2026-0310-001",
        )
        result = parse_meijer_receipt(raw)
        _validate_receipt_schema(result)
        for item in result["items"]:
            _validate_item_schema(item)

    def test_item_count_excludes_voided(self, meijer_receipt_data):
        """Fixture has 6 items, 1 should be excluded (voided soda)."""
        result = self._parse(meijer_receipt_data)
        assert len(result["items"]) == 5

    def test_totals_are_positive_decimals(self, meijer_receipt_data):
        """All receipt-level money totals are strictly positive."""
        result = self._parse(meijer_receipt_data)
        for field in ("total", "subtotal", "tax", "savings_total"):
            assert result[field] > Decimal("0"), field

    def test_receipt_id_preserved(self, meijer_receipt_data):
        """The receipt id given on the ``RawReceipt`` is carried into the result."""
        result = self._parse(meijer_receipt_data)
        assert result["receipt_id"] == "TXN-2026-0310-001"

    def test_known_product_prices(self, meijer_receipt_data):
        """Verify specific Meijer products produce correct price extraction."""
        result = self._parse(meijer_receipt_data)
        items_by_name = {i["product_name_raw"]: i for i in result["items"]}

        # Bananas: $0.69
        bananas = items_by_name["ORGANIC BANANAS"]
        assert bananas["unit_price"] == Decimal("0.69")
        # The raw mPerks field must surface as loyalty_discount only; the item
        # schema in this module does not allow a "mperks_discount" key.
        assert "mperks_discount" not in bananas
        assert bananas["loyalty_discount"] == Decimal("0.10")

        # Milk: qty 2, $3.49 each, total $6.98
        milk = items_by_name["MEIJER 2% MILK GAL"]
        assert milk["quantity"] == Decimal("2")
        assert milk["unit_price"] == Decimal("3.49")
        assert milk["extended_price"] == Decimal("6.98")

        # Weighted deli turkey: 0.75 lb at $8.99/lb
        turkey = items_by_name["WEIGHTED DELI TURKEY"]
        assert turkey["quantity"] == Decimal("0.75")
        assert turkey["upc"] is None

    def test_mperks_discount_captured(self, meijer_receipt_data):
        """Paper towels has $1.00 mPerks discount."""
        result = self._parse(meijer_receipt_data)
        towels = self._item_containing(result, "PAPER TOWELS")
        assert towels["loyalty_discount"] == Decimal("1.00")
        assert towels["coupon_discount"] == Decimal("1.00")

    def test_cheerios_coupon_discount(self, meijer_receipt_data):
        """Cheerios has $0.50 coupon."""
        result = self._parse(meijer_receipt_data)
        cheerios = self._item_containing(result, "CHEERIOS")
        assert cheerios["coupon_discount"] == Decimal("0.50")
not crash.""" + + def test_kroger_empty_receipt(self): + raw = RawReceipt(receipt_id="KR-EMPTY", purchase_date="2026-03-12", raw_data={}) + result = parse_kroger_receipt(raw) + _validate_receipt_schema(result) + assert result["items"] == [] + assert result["total"] == Decimal("0") + + def test_meijer_empty_receipt(self): + raw = RawReceipt(receipt_id="MJ-EMPTY", purchase_date="2026-03-10", raw_data={}) + result = parse_meijer_receipt(raw) + _validate_receipt_schema(result) + assert result["items"] == [] + assert result["total"] == Decimal("0") + + def test_kroger_receipt_no_detail(self): + raw = RawReceipt( + receipt_id="KR-NODET", + purchase_date="2026-03-12", + raw_data={"total": 50.00}, + ) + result = parse_kroger_receipt(raw) + _validate_receipt_schema(result) + assert result["items"] == [] + assert result["total"] == Decimal("50.00") + + def test_meijer_receipt_no_detail(self): + raw = RawReceipt( + receipt_id="MJ-NODET", + purchase_date="2026-03-10", + raw_data={"total": 30.00}, + ) + result = parse_meijer_receipt(raw) + _validate_receipt_schema(result) + assert result["items"] == [] + assert result["total"] == Decimal("30.00") + + def test_kroger_receipt_all_voided(self): + """A receipt where every item is voided should have 0 items.""" + raw = RawReceipt( + receipt_id="KR-ALLVOID", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "items": [ + {"description": "VOIDED A", "basePrice": 5.0, "voided": True}, + {"description": "VOIDED B", "basePrice": 3.0, "status": "VOIDED"}, + {"description": "RETURNED C", "basePrice": 7.0, "status": "RETURNED"}, + {"description": "RETURNED D", "basePrice": 2.0, "returnFlag": True}, + ], + "total": 0, + } + }, + ) + result = parse_kroger_receipt(raw) + _validate_receipt_schema(result) + assert result["items"] == [] + + def test_meijer_receipt_all_voided(self): + raw = RawReceipt( + receipt_id="MJ-ALLVOID", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [ + {"description": "VOIDED A", "price": 
class ConcreteScraper(BaseScraper):
    """Minimal BaseScraper subclass so the abstract base can be instantiated in tests."""

    async def login(self, username, password):
        """Return an empty session; the credentials are ignored."""
        session = SessionData(
            cookies=[],
            user_agent="test",
            created_at=datetime.now(),
        )
        return session

    async def check_session(self, session):
        """Report every session as valid."""
        return True

    async def scrape_receipts(self, session, since=None):
        """Return no receipts."""
        return []

    def parse_receipt(self, raw):
        """Return an empty parsed receipt."""
        return {}


class TestBaseScraper:
    """Behavior provided by the base module: human_delay and the data containers."""

    @pytest.mark.asyncio
    async def test_human_delay_respects_bounds(self):
        """A 100-200 ms request must sleep for 0.1-0.2 seconds."""
        scraper = ConcreteScraper()
        with patch("receiptwitness.scrapers.base.asyncio.sleep", return_value=None) as mock_sleep:
            await scraper.human_delay(min_ms=100, max_ms=200)
            slept_for = mock_sleep.call_args.args[0]
            assert 0.1 <= slept_for <= 0.2

    def test_raw_receipt_dataclass(self):
        """RawReceipt keeps the values it was constructed with."""
        fields = {
            "receipt_id": "test-123",
            "purchase_date": "2026-03-10",
            "store_number": "42",
            "raw_data": {"key": "value"},
        }
        receipt = RawReceipt(**fields)
        assert receipt.receipt_id == "test-123"
        assert receipt.raw_data == {"key": "value"}

    def test_session_data_defaults(self):
        """Omitted SessionData fields default to no expiry and an empty extra dict."""
        created = datetime.now()
        session = SessionData(cookies=[], user_agent="test", created_at=created)
        assert session.expires_at is None
        assert session.extra == {}
a/tests/test_scrapers/test_kroger_scraper.py b/tests/test_scrapers/test_kroger_scraper.py new file mode 100644 index 0000000..3a88516 --- /dev/null +++ b/tests/test_scrapers/test_kroger_scraper.py @@ -0,0 +1,574 @@ +"""Tests for the Kroger scraper. + +These tests mock Playwright to avoid requiring real Kroger credentials +or network access. They verify the scraper's control flow, session handling, +date filtering, and error resilience. +""" + +from datetime import UTC, datetime, timedelta +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from receiptwitness.scrapers.base import RawReceipt, SessionData +from receiptwitness.scrapers.kroger import ( + DEFAULT_TIMEZONE, + DEFAULT_USER_AGENT, + DEFAULT_VIEWPORT, + KROGER_BASE, + KROGER_LOGIN_PAGE, + KROGER_PURCHASE_HISTORY, + KrogerScraper, +) + + +@pytest.fixture +def scraper(): + return KrogerScraper() + + +@pytest.fixture +def valid_session(): + return SessionData( + cookies=[{"name": "session", "value": "abc123", "domain": ".kroger.com", "path": "/"}], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=datetime.now(UTC) + timedelta(hours=2), + extra={"retailer": "kroger"}, + ) + + +@pytest.fixture +def expired_session(): + return SessionData( + cookies=[{"name": "session", "value": "expired", "domain": ".kroger.com", "path": "/"}], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC) - timedelta(hours=4), + expires_at=datetime.now(UTC) - timedelta(hours=2), + ) + + +class TestKrogerScraperConstants: + def test_base_url(self): + assert KROGER_BASE == "https://www.kroger.com" + + def test_login_page(self): + assert KROGER_LOGIN_PAGE == "https://www.kroger.com/signin" + + def test_purchase_history_page(self): + assert KROGER_PURCHASE_HISTORY == "https://www.kroger.com/mypurchases" + + def test_default_user_agent_is_chrome(self): + assert "Chrome" in DEFAULT_USER_AGENT + assert "Windows" in DEFAULT_USER_AGENT + + def test_default_viewport_hd(self): + assert 
DEFAULT_VIEWPORT == {"width": 1920, "height": 1080} + + def test_default_timezone(self): + assert DEFAULT_TIMEZONE == "America/New_York" + + +class TestCheckSession: + @pytest.mark.asyncio + async def test_expired_session_returns_false(self, scraper, expired_session): + result = await scraper.check_session(expired_session) + assert result is False + + @pytest.mark.asyncio + async def test_no_expiry_checks_via_browser(self, scraper): + session = SessionData( + cookies=[], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=None, + ) + mock_page = AsyncMock() + mock_page.url = "https://www.kroger.com/account/dashboard" + mock_response = MagicMock() + mock_response.ok = True + mock_page.goto = AsyncMock(return_value=mock_response) + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw: + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + result = await scraper.check_session(session) + assert result is True + + @pytest.mark.asyncio + async def test_session_redirected_to_signin_returns_false(self, scraper): + session = SessionData( + cookies=[], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=None, + ) + mock_page = AsyncMock() + mock_page.url = "https://www.kroger.com/signin?redirectUrl=account" + mock_response = MagicMock() + mock_response.ok = True + mock_page.goto = AsyncMock(return_value=mock_response) + + mock_context = AsyncMock() + mock_context.new_page = 
AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw: + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + result = await scraper.check_session(session) + assert result is False + + +class TestLogin: + @pytest.mark.asyncio + async def test_login_returns_session_data(self, scraper): + mock_page = AsyncMock() + mock_page.url = "https://www.kroger.com/" + + # Mock locator chain + mock_email = AsyncMock() + mock_password = AsyncMock() + mock_button = AsyncMock() + mock_page.locator = MagicMock(side_effect=[mock_email, mock_password, mock_button]) + mock_page.wait_for_url = AsyncMock() + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.cookies = AsyncMock( + return_value=[ + {"name": "kroger_session", "value": "test123", "domain": ".kroger.com", "path": "/"} + ] + ) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + session = await scraper.login("user@test.com", 
"password123") + + assert isinstance(session, SessionData) + assert len(session.cookies) == 1 + assert session.cookies[0]["name"] == "kroger_session" + assert session.user_agent == DEFAULT_USER_AGENT + assert session.expires_at is not None + assert session.extra == {"retailer": "kroger"} + + +class TestScrapeReceipts: + @pytest.mark.asyncio + async def test_scrape_returns_receipts(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.status = 200 + mock_api_response.json = AsyncMock( + return_value={ + "orders": [ + { + "orderId": "KR-001", + "purchaseDate": "2026-03-10T14:00:00Z", + "storeNumber": "357", + }, + { + "orderId": "KR-002", + "purchaseDate": "2026-03-11T10:00:00Z", + "storeNumber": "357", + }, + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={"items": []}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock( + side_effect=[mock_api_response, mock_detail_response, mock_detail_response] + ) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + + assert len(receipts) == 2 + assert 
receipts[0].receipt_id == "KR-001" + assert receipts[1].receipt_id == "KR-002" + assert isinstance(receipts[0], RawReceipt) + + @pytest.mark.asyncio + async def test_scrape_filters_by_date(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "orders": [ + { + "orderId": "KR-OLD", + "purchaseDate": "2026-01-01T10:00:00Z", + "storeNumber": "357", + }, + { + "orderId": "KR-NEW", + "purchaseDate": "2026-03-15T10:00:00Z", + "storeNumber": "357", + }, + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + since = datetime(2026, 3, 1, tzinfo=UTC) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session, since=since) + + assert len(receipts) == 1 + assert receipts[0].receipt_id == "KR-NEW" + + @pytest.mark.asyncio + async def test_scrape_handles_api_failure(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = False + 
mock_api_response.status = 500 + mock_api_response.status_text = "Internal Server Error" + + mock_request = AsyncMock() + mock_request.get = AsyncMock(return_value=mock_api_response) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert receipts == [] + + @pytest.mark.asyncio + async def test_scrape_handles_unexpected_response(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock(return_value="not a dict") + + mock_request = AsyncMock() + mock_request.get = AsyncMock(return_value=mock_api_response) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + 
patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert receipts == [] + + @pytest.mark.asyncio + async def test_scrape_alternative_field_names(self, scraper, valid_session): + """Kroger may use 'purchases' instead of 'orders'.""" + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "purchases": [ + { + "receiptId": "KR-ALT-001", + "transactionDate": "2026-03-10T14:00:00Z", + "divisionNumber": "014", + } + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + + assert len(receipts) == 1 + assert receipts[0].receipt_id == "KR-ALT-001" + + @pytest.mark.asyncio + async def 
test_scrape_skips_orders_without_id(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "orders": [ + {"purchaseDate": "2026-03-10T14:00:00Z"}, # no id + {"orderId": "KR-VALID", "purchaseDate": "2026-03-10T14:00:00Z"}, + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert len(receipts) == 1 + assert receipts[0].receipt_id == "KR-VALID" + + @pytest.mark.asyncio + async def test_scrape_skips_orders_with_null_id(self, scraper, valid_session): + """Ensure orderId: null doesn't produce receipt_id='None' (str(None) bug).""" + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "orders": [ + {"orderId": None, "receiptId": None, "purchaseDate": "2026-03-10T14:00:00Z"}, + {"orderId": "KR-REAL", "purchaseDate": "2026-03-10T14:00:00Z"}, + ] + } + ) 
+ + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert len(receipts) == 1 + assert receipts[0].receipt_id == "KR-REAL" + # Verify no receipt has the string "None" as its ID + assert all(r.receipt_id != "None" for r in receipts) + + +class TestParseReceipt: + def test_parse_receipt_delegates_to_parser(self, scraper): + raw = RawReceipt( + receipt_id="KR-001", + purchase_date="2026-03-12", + raw_data={ + "detail": { + "items": [ + { + "description": "TEST ITEM", + "basePrice": 5.00, + "totalPrice": 5.00, + } + ], + "total": 5.00, + } + }, + ) + result = scraper.parse_receipt(raw) + assert result["receipt_id"] == "KR-001" + assert len(result["items"]) == 1 + + def test_receipt_detail_failure_returns_empty(self, scraper): + """Verify receipt detail failures produce empty detail.""" + raw = RawReceipt( + receipt_id="KR-FAIL", + purchase_date="2026-03-12", + raw_data={"total": 10.00, "detail": {}}, + ) + result = 
scraper.parse_receipt(raw) + assert result["receipt_id"] == "KR-FAIL" + assert result["items"] == [] diff --git a/tests/test_scrapers/test_meijer_scraper.py b/tests/test_scrapers/test_meijer_scraper.py new file mode 100644 index 0000000..05664e1 --- /dev/null +++ b/tests/test_scrapers/test_meijer_scraper.py @@ -0,0 +1,585 @@ +"""Tests for the Meijer scraper. + +These tests mock Playwright to avoid requiring real Meijer credentials +or network access. They verify the scraper's control flow, session handling, +date filtering, and error resilience. +""" + +from datetime import UTC, datetime, timedelta +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from receiptwitness.scrapers.base import RawReceipt, SessionData +from receiptwitness.scrapers.meijer import ( + DEFAULT_TIMEZONE, + DEFAULT_USER_AGENT, + DEFAULT_VIEWPORT, + MEIJER_BASE, + MEIJER_LOGIN_PAGE, + MEIJER_MPERKS_HOME, + MEIJER_PURCHASE_HISTORY, + MeijerScraper, +) + + +@pytest.fixture +def scraper(): + return MeijerScraper() + + +@pytest.fixture +def valid_session(): + return SessionData( + cookies=[ + {"name": "meijer_session", "value": "abc123", "domain": ".meijer.com", "path": "/"} + ], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=datetime.now(UTC) + timedelta(hours=4), + ) + + +@pytest.fixture +def expired_session(): + return SessionData( + cookies=[ + {"name": "meijer_session", "value": "expired", "domain": ".meijer.com", "path": "/"} + ], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC) - timedelta(hours=8), + expires_at=datetime.now(UTC) - timedelta(hours=4), + ) + + +class TestMeijerScraperConstants: + def test_base_url(self): + assert MEIJER_BASE == "https://www.meijer.com" + + def test_login_page(self): + assert MEIJER_LOGIN_PAGE == "https://www.meijer.com/shopping/login.html" + + def test_mperks_home(self): + assert MEIJER_MPERKS_HOME == "https://www.meijer.com/mperks.html" + + def test_purchase_history_url(self): + assert ( + 
MEIJER_PURCHASE_HISTORY == "https://www.meijer.com/bin/meijer/profile/purchasehistory" + ) + + def test_default_user_agent_is_chrome(self): + assert "Chrome" in DEFAULT_USER_AGENT + assert "Windows" in DEFAULT_USER_AGENT + + def test_default_viewport_hd(self): + assert DEFAULT_VIEWPORT == {"width": 1920, "height": 1080} + + def test_default_timezone(self): + assert DEFAULT_TIMEZONE == "America/Detroit" + + +class TestCheckSession: + @pytest.mark.asyncio + async def test_expired_session_returns_false(self, scraper, expired_session): + result = await scraper.check_session(expired_session) + assert result is False + + @pytest.mark.asyncio + async def test_no_expiry_checks_via_browser(self, scraper): + session = SessionData( + cookies=[], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=None, + ) + mock_page = AsyncMock() + mock_page.url = "https://www.meijer.com/mperks.html" + mock_response = MagicMock() + mock_response.ok = True + mock_page.goto = AsyncMock(return_value=mock_response) + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw: + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + result = await scraper.check_session(session) + assert result is True + + @pytest.mark.asyncio + async def test_session_redirected_to_login_returns_false(self, scraper): + session = SessionData( + cookies=[], + user_agent=DEFAULT_USER_AGENT, + created_at=datetime.now(UTC), + expires_at=None, + ) + mock_page = AsyncMock() 
+ mock_page.url = "https://www.meijer.com/shopping/login.html?redirect=mperks" + mock_response = MagicMock() + mock_response.ok = True + mock_page.goto = AsyncMock(return_value=mock_response) + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw: + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + result = await scraper.check_session(session) + assert result is False + + +class TestLogin: + @pytest.mark.asyncio + async def test_login_returns_session_data(self, scraper): + mock_page = AsyncMock() + mock_page.url = "https://www.meijer.com/mperks.html" + + # Mock locator chain + mock_email = AsyncMock() + mock_password = AsyncMock() + mock_button = AsyncMock() + mock_page.locator = MagicMock(side_effect=[mock_email, mock_password, mock_button]) + mock_page.wait_for_url = AsyncMock() + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.cookies = AsyncMock( + return_value=[ + {"name": "meijer_session", "value": "test456", "domain": ".meijer.com", "path": "/"} + ] + ) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, 
"human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + session = await scraper.login("user@test.com", "password123") + + assert isinstance(session, SessionData) + assert len(session.cookies) == 1 + assert session.cookies[0]["name"] == "meijer_session" + assert session.user_agent == DEFAULT_USER_AGENT + assert session.expires_at is not None + # Meijer sessions last 4 hours + assert session.expires_at > session.created_at + timedelta(hours=3) + + +class TestScrapeReceipts: + @pytest.mark.asyncio + async def test_scrape_returns_receipts(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.status = 200 + mock_api_response.json = AsyncMock( + return_value={ + "transactions": [ + { + "transactionId": "TXN-001", + "transactionDate": "2026-03-10T14:00:00Z", + "storeNumber": "42", + }, + { + "transactionId": "TXN-002", + "transactionDate": "2026-03-11T10:00:00Z", + "storeNumber": "42", + }, + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={"items": []}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock( + side_effect=[mock_api_response, mock_detail_response, mock_detail_response] + ) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + 
patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + + assert len(receipts) == 2 + assert receipts[0].receipt_id == "TXN-001" + assert receipts[1].receipt_id == "TXN-002" + assert isinstance(receipts[0], RawReceipt) + + @pytest.mark.asyncio + async def test_scrape_filters_by_date(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "transactions": [ + { + "transactionId": "TXN-OLD", + "transactionDate": "2026-01-01T10:00:00Z", + "storeNumber": "42", + }, + { + "transactionId": "TXN-NEW", + "transactionDate": "2026-03-15T10:00:00Z", + "storeNumber": "42", + }, + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + since = datetime(2026, 3, 1, tzinfo=UTC) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + 
mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session, since=since) + + assert len(receipts) == 1 + assert receipts[0].receipt_id == "TXN-NEW" + + @pytest.mark.asyncio + async def test_scrape_handles_api_failure(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = False + mock_api_response.status = 500 + mock_api_response.status_text = "Internal Server Error" + + mock_request = AsyncMock() + mock_request.get = AsyncMock(return_value=mock_api_response) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert receipts == [] + + @pytest.mark.asyncio + async def test_scrape_handles_unexpected_response(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock(return_value="not a dict") + + mock_request = AsyncMock() + mock_request.get = AsyncMock(return_value=mock_api_response) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + 
mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert receipts == [] + + @pytest.mark.asyncio + async def test_scrape_alternative_field_names(self, scraper, valid_session): + """Meijer may use 'purchaseHistory' instead of 'transactions'.""" + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "purchaseHistory": [ + { + "receiptId": "MJ-ALT-001", + "purchaseDate": "2026-03-10T14:00:00Z", + "storeId": "99", + } + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = 
AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + + assert len(receipts) == 1 + assert receipts[0].receipt_id == "MJ-ALT-001" + + @pytest.mark.asyncio + async def test_scrape_skips_transactions_without_id(self, scraper, valid_session): + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "transactions": [ + {"transactionDate": "2026-03-10T14:00:00Z"}, # no id + {"transactionId": "TXN-VALID", "transactionDate": "2026-03-10T14:00:00Z"}, + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = True + mock_detail_response.json = AsyncMock(return_value={}) + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert len(receipts) == 1 + assert receipts[0].receipt_id == "TXN-VALID" + + @pytest.mark.asyncio + async def test_scrape_receipt_detail_failure_returns_empty_detail(self, scraper, 
valid_session): + """Receipt detail API failure should not crash the scraper.""" + mock_api_response = AsyncMock() + mock_api_response.ok = True + mock_api_response.json = AsyncMock( + return_value={ + "transactions": [ + { + "transactionId": "TXN-DETAIL-FAIL", + "transactionDate": "2026-03-10T14:00:00Z", + "storeNumber": "42", + } + ] + } + ) + + mock_detail_response = AsyncMock() + mock_detail_response.ok = False + mock_detail_response.status = 404 + + mock_request = AsyncMock() + mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response]) + + mock_page = AsyncMock() + mock_page.goto = AsyncMock() + mock_page.request = mock_request + + mock_context = AsyncMock() + mock_context.new_page = AsyncMock(return_value=mock_page) + mock_context.add_cookies = AsyncMock() + mock_context.add_init_script = AsyncMock() + mock_browser = AsyncMock() + mock_browser.new_context = AsyncMock(return_value=mock_context) + mock_context.browser = mock_browser + + mock_pw = AsyncMock() + mock_pw.chromium.launch = AsyncMock(return_value=mock_browser) + + with ( + patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw, + patch.object(scraper, "human_delay", new_callable=AsyncMock), + ): + mock_cm = AsyncMock() + mock_cm.__aenter__ = AsyncMock(return_value=mock_pw) + mock_cm.__aexit__ = AsyncMock(return_value=False) + mock_apw.return_value = mock_cm + + receipts = await scraper.scrape_receipts(valid_session) + assert len(receipts) == 1 + assert receipts[0].receipt_id == "TXN-DETAIL-FAIL" + assert receipts[0].raw_data.get("detail") == {} + + +class TestParseReceipt: + def test_parse_receipt_delegates_to_parser(self, scraper): + raw = RawReceipt( + receipt_id="TXN-001", + purchase_date="2026-03-10", + raw_data={ + "detail": { + "items": [ + { + "description": "TEST ITEM", + "price": 5.00, + "extendedPrice": 5.00, + } + ], + "total": 5.00, + } + }, + ) + result = scraper.parse_receipt(raw) + assert result["receipt_id"] == "TXN-001" + assert 
len(result["items"]) == 1 + + def test_receipt_detail_failure_returns_empty(self, scraper): + raw = RawReceipt( + receipt_id="TXN-FAIL", + purchase_date="2026-03-10", + raw_data={"total": 10.00, "detail": {}}, + ) + result = scraper.parse_receipt(raw) + assert result["receipt_id"] == "TXN-FAIL" + assert result["items"] == [] diff --git a/tests/test_session/__init__.py b/tests/test_session/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_session/test_encryption.py b/tests/test_session/test_encryption.py new file mode 100644 index 0000000..59a57fa --- /dev/null +++ b/tests/test_session/test_encryption.py @@ -0,0 +1,61 @@ +"""Tests for session encryption/decryption.""" + +from unittest.mock import patch + +import pytest +from cryptography.fernet import Fernet, InvalidToken + +from receiptwitness.session.encryption import decrypt_session_data, encrypt_session_data + +TEST_KEY = Fernet.generate_key().decode() + + +@pytest.fixture(autouse=True) +def _mock_encryption_key(): + with patch("receiptwitness.session.encryption.settings") as mock_settings: + mock_settings.session_encryption_key = TEST_KEY + yield + + +class TestEncryptDecrypt: + def test_roundtrip(self): + data = { + "cookies": [{"name": "session", "value": "abc123", "domain": ".meijer.com"}], + "user_agent": "Mozilla/5.0", + } + encrypted = encrypt_session_data(data) + assert isinstance(encrypted, str) + assert encrypted != str(data) + + decrypted = decrypt_session_data(encrypted) + assert decrypted == data + + def test_different_data_different_ciphertext(self): + data1 = {"key": "value1"} + data2 = {"key": "value2"} + enc1 = encrypt_session_data(data1) + enc2 = encrypt_session_data(data2) + assert enc1 != enc2 + + def test_decrypt_with_wrong_key_fails(self): + data = {"cookies": []} + encrypted = encrypt_session_data(data) + + wrong_key = Fernet.generate_key().decode() + with patch("receiptwitness.session.encryption.settings") as mock_settings: + 
mock_settings.session_encryption_key = wrong_key + with pytest.raises(InvalidToken): + decrypt_session_data(encrypted) + + def test_decrypt_tampered_data_fails(self): + data = {"cookies": []} + encrypted = encrypt_session_data(data) + tampered = encrypted[:-5] + "XXXXX" + with pytest.raises(Exception): + decrypt_session_data(tampered) + + def test_no_key_raises_error(self): + with patch("receiptwitness.session.encryption.settings") as mock_settings: + mock_settings.session_encryption_key = "" + with pytest.raises(ValueError, match="RW_SESSION_ENCRYPTION_KEY"): + encrypt_session_data({"test": True}) diff --git a/tests/test_session/test_manager.py b/tests/test_session/test_manager.py new file mode 100644 index 0000000..68e1015 --- /dev/null +++ b/tests/test_session/test_manager.py @@ -0,0 +1,102 @@ +"""Tests for session manager logic.""" + +from datetime import UTC, datetime, timedelta +from unittest.mock import AsyncMock, patch + +import pytest +from cryptography.fernet import Fernet + +from receiptwitness.scrapers.base import SessionData +from receiptwitness.session.manager import ( + get_valid_session, + session_from_db_record, + session_to_db_value, +) + +TEST_KEY = Fernet.generate_key().decode() + + +@pytest.fixture(autouse=True) +def _mock_encryption_key(): + with patch("receiptwitness.session.encryption.settings") as mock_settings: + mock_settings.session_encryption_key = TEST_KEY + yield + + +def _make_session(hours_until_expire: int = 4) -> SessionData: + now = datetime.now(UTC) + return SessionData( + cookies=[{"name": "sid", "value": "test", "domain": ".meijer.com"}], + user_agent="Mozilla/5.0", + created_at=now, + expires_at=now + timedelta(hours=hours_until_expire), + ) + + +class TestSessionSerialization: + def test_roundtrip(self): + session = _make_session() + db_value = session_to_db_value(session) + restored = session_from_db_record(db_value) + + assert restored is not None + assert restored.cookies == session.cookies + assert restored.user_agent == 
session.user_agent + + def test_none_returns_none(self): + assert session_from_db_record(None) is None + + def test_invalid_encrypted_returns_none(self): + assert session_from_db_record("garbage-data") is None + + +class TestGetValidSession: + @pytest.mark.asyncio + async def test_valid_existing_session(self): + session = _make_session() + db_value = session_to_db_value(session) + + scraper = AsyncMock() + scraper.check_session.return_value = True + + result, was_refreshed = await get_valid_session(scraper, db_value, "user", "pass") + assert not was_refreshed + assert result.cookies == session.cookies + scraper.login.assert_not_called() + + @pytest.mark.asyncio + async def test_expired_session_triggers_login(self): + session = _make_session(hours_until_expire=-1) # already expired + db_value = session_to_db_value(session) + + new_session = _make_session() + scraper = AsyncMock() + scraper.login.return_value = new_session + + result, was_refreshed = await get_valid_session(scraper, db_value, "user", "pass") + assert was_refreshed + scraper.login.assert_called_once_with("user", "pass") + + @pytest.mark.asyncio + async def test_no_existing_session_triggers_login(self): + new_session = _make_session() + scraper = AsyncMock() + scraper.login.return_value = new_session + + result, was_refreshed = await get_valid_session(scraper, None, "user", "pass") + assert was_refreshed + scraper.login.assert_called_once() + + @pytest.mark.asyncio + async def test_failed_session_check_triggers_login(self): + session = _make_session() + db_value = session_to_db_value(session) + + new_session = _make_session() + scraper = AsyncMock() + scraper.check_session.return_value = False + scraper.login.return_value = new_session + + result, was_refreshed = await get_valid_session(scraper, db_value, "user", "pass") + assert was_refreshed + scraper.login.assert_called_once()