From 253fd8464f4ef066200b47aa8f5c2162cb853898 Mon Sep 17 00:00:00 2001
From: Coupon Carl <coupon-carl@cartsnitch.com>
Date: Sat, 28 Mar 2026 02:24:22 +0000
Subject: [PATCH] Squashed 'receiptwitness/' content from commit e8d374a

git-subtree-dir: receiptwitness
git-subtree-split: e8d374a89ed8978f429598e02d31b1c5963efe22
---
 .dockerignore                                |  12 +
 .gitignore                                   |   7 +
 CLAUDE.md                                    | 227 ++++++++++++
 Dockerfile                                   |  67 ++++
 renovate.json                                |   4 +
 src/receiptwitness/__init__.py               |   1 +
 src/receiptwitness/api/__init__.py           |   1 +
 src/receiptwitness/api/routes.py             |  10 +
 src/receiptwitness/config.py                 |  26 ++
 src/receiptwitness/events.py                 |  75 ++++
 src/receiptwitness/main.py                   |   8 +
 src/receiptwitness/parsers/__init__.py       |   1 +
 src/receiptwitness/parsers/kroger.py         | 148 ++++++++
 src/receiptwitness/parsers/meijer.py         | 138 ++++++++
 src/receiptwitness/parsers/target.py         | 191 ++++++++++
 src/receiptwitness/pipeline/__init__.py      |  30 ++
 src/receiptwitness/pipeline/matching.py      | 136 ++++++++
 src/receiptwitness/pipeline/normalization.py | 155 +++++++++
 src/receiptwitness/pipeline/receipt.py       | 144 ++++++++
 src/receiptwitness/scrapers/__init__.py      |   1 +
 src/receiptwitness/scrapers/base.py          |  72 ++++
 src/receiptwitness/scrapers/kroger.py        | 344 +++++++++++++++++++
 src/receiptwitness/scrapers/meijer.py        | 301 ++++++++++++++++
 src/receiptwitness/scrapers/target.py        | 326 ++++++++++++++++++
 src/receiptwitness/session/__init__.py       |   1 +
 src/receiptwitness/session/encryption.py     |  52 +++
 src/receiptwitness/session/manager.py        |  81 +++++
 27 files changed, 2559 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 .gitignore
 create mode 100644 CLAUDE.md
 create mode 100644 Dockerfile
 create mode 100644 renovate.json
 create mode 100644 src/receiptwitness/__init__.py
 create mode 100644 src/receiptwitness/api/__init__.py
 create mode 100644 src/receiptwitness/api/routes.py
 create mode 100644 src/receiptwitness/config.py
 create mode 100644 src/receiptwitness/events.py
 create mode 100644 src/receiptwitness/main.py
 create mode 100644 src/receiptwitness/parsers/__init__.py
 create mode 100644 src/receiptwitness/parsers/kroger.py
 create mode 100644 src/receiptwitness/parsers/meijer.py
 create mode 100644 src/receiptwitness/parsers/target.py
 create mode 100644 src/receiptwitness/pipeline/__init__.py
 create mode 100644 src/receiptwitness/pipeline/matching.py
 create mode 100644 src/receiptwitness/pipeline/normalization.py
 create mode 100644 src/receiptwitness/pipeline/receipt.py
 create mode 100644 src/receiptwitness/scrapers/__init__.py
 create mode 100644 src/receiptwitness/scrapers/base.py
 create mode 100644 src/receiptwitness/scrapers/kroger.py
 create mode 100644 src/receiptwitness/scrapers/meijer.py
 create mode 100644 src/receiptwitness/scrapers/target.py
 create mode 100644 src/receiptwitness/session/__init__.py
 create mode 100644 src/receiptwitness/session/encryption.py
 create mode 100644 src/receiptwitness/session/manager.py

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..289a751
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,12 @@
+__pycache__/
+*.pyc
+.pytest_cache/
+*.egg-info/
+dist/
+.venv/
+.env
+.git/
+.github/
+tests/
+*.md
+renovate.json
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..687387e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+__pycache__/
+*.pyc
+.pytest_cache/
+*.egg-info/
+dist/
+.venv/
+.env
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..255b742
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,227 @@
+# ReceiptWitness — CartSnitch Receipt Ingestion Service
+
+## Project Context
+
+CartSnitch is a self-hosted grocery price intelligence platform built as a polyrepo microservices architecture. This repo (`cartsnitch/receiptwitness`) is the receipt/purchase history ingestion service.
+
+**GitHub org:** github.com/cartsnitch
+**Domain:** cartsnitch.com
+
+### CartSnitch Services
+
+| Repo | Service | Purpose |
+|------|---------|---------|
+| `cartsnitch/common` | — | Shared models, schemas, utilities |
+| `cartsnitch/receiptwitness` | ReceiptWitness | Purchase data ingestion via retailer scrapers (this repo) |
+| `cartsnitch/api` | API Gateway | Frontend-facing REST API |
+| `cartsnitch/cartsnitch` | Frontend | React PWA (mobile-first) |
+| `cartsnitch/stickershock` | StickerShock | Price increase detection & CPI comparison |
+| `cartsnitch/shrinkray` | ShrinkRay | Shrinkflation monitoring |
+| `cartsnitch/clipartist` | ClipArtist | Coupon/deal watching & shopping optimization |
+| `cartsnitch/infra` | — | K8s manifests, Flux kustomizations |
+
+### Architecture Decisions
+
+- **Polyrepo:** Each service has its own repo, Dockerfile, CI/CD pipeline.
+- **Shared DB:** One PostgreSQL cluster. This service writes to `purchases`, `purchase_items`, `price_history` tables. Models come from `cartsnitch-common`.
+- **Inter-service comms:** REST (synchronous) + Redis pub/sub (async events).
+- **Target scale:** 500–1,000 users. Each user has their own authenticated sessions to up to 3 retailers.
+
+## What This Service Does
+
+ReceiptWitness authenticates with grocery retailer web portals using per-user sessions, scrapes purchase history / receipt data, parses it into structured records, and writes it to the shared database. After ingestion, it publishes a `cartsnitch.receipts.ingested` event so downstream services (StickerShock, ClipArtist) can react.
+
+### Target Retailers (MVP)
+
+#### Meijer (mPerks)
+- **Auth:** No public API. Session cookie-based auth on mperks.meijer.com.
+- **Receipt location:** meijer.com/mperks/receipts-savings.html (or underlying XHR endpoints)
+- **Approach:** Playwright login → capture session → hit receipt XHR endpoints directly. Map the API calls the frontend makes via browser dev tools network tab.
+- **Prior art:** `dapperfu/python_Meijer` (requires MITM proxy for auth — avoid this pattern, prefer direct browser automation).
+- **Data available:** Digital receipts appear ~15 minutes after purchase if mPerks ID was used at checkout. Includes item names, prices, discounts, savings.
+
+#### Kroger
+- **Auth:** No public API for purchase history (that's behind Partner API). Session cookie-based auth on kroger.com.
+- **Receipt location:** kroger.com/mypurchases
+- **Approach:** Playwright login → scrape purchase history pages or intercept XHR endpoints.
+- **Anti-bot:** Kroger uses Akamai Bot Manager. Aggressive headless browser detection. Need Playwright stealth, realistic fingerprinting, human-like interaction pacing.
+- **Prior art:** `phyllis-vance/KrogerScrape` (.NET, old), `callaginn/kroger-sweeper` (Puppeteer/Node), `ThermoMan/Get-Kroger-Grocery-List` (Greasemonkey userscript).
+- **Kroger public API:** Free developer account at developer.kroger.com provides product catalog data (`product.compact` scope) — useful for enriching scraped receipt data with UPCs, categories, product images. NOT useful for purchase history.
+- **Data available:** Purchase history tied to Kroger Plus loyalty card. Shows items, prices, quantities.
+
+#### Target (Circle)
+- **Auth:** Session-based auth on target.com.
+- **Receipt location:** target.com account → Orders → In-store tab, or target.com/account/orders
+- **Approach:** Playwright login → scrape in-store purchase history.
+- **Data available:** ~1 year of history if user paid with a linked card, used the Target app wallet, or entered their Target Circle phone number at checkout. Includes item names, prices.
+
+## Tech Stack
+
+- Python 3.12+
+- Playwright (Python async API) for headless browser automation
+- FastAPI (lightweight internal API for triggering scrapes, health checks, status)
+- SQLAlchemy 2.0 (via `cartsnitch-common`)
+- Redis (pub/sub event publishing)
+- APScheduler or Celery (for scheduled scraping jobs)
+- cryptography / Fernet (encrypting stored session data)
+
+## Repo Structure
+
+```
+receiptwitness/
+├── CLAUDE.md
+├── README.md
+├── pyproject.toml
+├── Dockerfile                  # Playwright + Chromium headless
+├── docker-compose.yml          # Local dev (Postgres, Redis, this service)
+├── src/
+│   └── receiptwitness/
+│       ├── __init__.py
+│       ├── config.py           # Service-specific settings
+│       ├── main.py             # FastAPI app + scheduler bootstrap
+│       ├── scrapers/
+│       │   ├── __init__.py
+│       │   ├── base.py         # Abstract BaseScraper class
+│       │   ├── meijer.py       # Meijer/mPerks scraper
+│       │   ├── kroger.py       # Kroger scraper
+│       │   └── target.py       # Target/Circle scraper
+│       ├── parsers/
+│       │   ├── __init__.py
+│       │   ├── meijer.py       # Parse raw Meijer receipt data → PurchaseItem records
+│       │   ├── kroger.py
+│       │   └── target.py
+│       ├── session/
+│       │   ├── __init__.py
+│       │   ├── manager.py      # Session storage, retrieval, refresh logic
+│       │   └── encryption.py   # Encrypt/decrypt session cookies at rest
+│       ├── scheduler.py        # Scrape scheduling (per-user cron jobs)
+│       ├── events.py           # Publish receipt.ingested events to Redis
+│       ├── api/
+│       │   ├── __init__.py
+│       │   ├── routes.py       # Internal API: trigger scrape, check status, health
+│       │   └── auth.py         # Internal service auth (API key or JWT)
+│       └── enrichment.py       # Optional: enrich receipt data via Kroger public API
+└── tests/
+    ├── conftest.py
+    ├── fixtures/               # Sample receipt HTML/JSON for testing parsers
+    │   ├── meijer_receipt.json
+    │   ├── kroger_receipt.html
+    │   └── target_receipt.html
+    ├── test_scrapers/
+    ├── test_parsers/
+    └── test_session/
+```
+
+## Scraper Architecture
+
+### Base Scraper Pattern
+
+```python
+class BaseScraper(ABC):
+    """All retailer scrapers implement this interface."""
+
+    @abstractmethod
+    async def login(self, credentials: UserStoreAccount) -> SessionData: ...
+
+    @abstractmethod
+    async def check_session(self, session: SessionData) -> bool: ...
+
+    @abstractmethod
+    async def scrape_receipts(self, session: SessionData, since: datetime | None) -> list[RawReceipt]: ...
+
+    @abstractmethod
+    def parse_receipt(self, raw: RawReceipt) -> tuple[Purchase, list[PurchaseItem]]: ...
+```
+
+### Scraping Flow
+
+1. **Scheduler fires** for a user+store combination
+2. **Load session** from `user_store_accounts` table (encrypted)
+3. **Check session validity** — quick lightweight request to verify auth
+4. **If expired:** launch Playwright, re-authenticate, save new session
+5. **Scrape receipts** since `last_sync_at` timestamp
+6. **Parse** raw data into `Purchase` and `PurchaseItem` records
+7. **Deduplicate** — skip receipts already in DB (match on `receipt_id` per store)
+8. **Write to DB** — insert new purchases and items
+9. **Derive price_history** entries from purchase_items
+10. **Publish event** — `cartsnitch.receipts.ingested` to Redis
+11. **Update** `user_store_accounts.last_sync_at`
+
+### Session Management
+
+- Sessions (cookies, tokens) are encrypted at rest using Fernet symmetric encryption.
+- The encryption key is provided via environment variable, not stored in the DB.
+- Sessions are stored in the `user_store_accounts` table as encrypted JSONB.
+- Each scrape attempt first checks if the existing session is valid before launching a full Playwright browser instance.
+- When a session expires, the service needs the user's stored credentials OR a manual re-auth flow (the user logs in via the frontend, and we capture the session).
+
+### Anti-Bot Considerations
+
+- Use `playwright-stealth` or equivalent to mask automation signals.
+- Set realistic viewport sizes, user agents, and locale settings.
+- Add human-like delays between page navigations (randomized 1-5 seconds).
+- For Kroger specifically (Akamai Bot Manager): may need to use non-headless mode on initial auth, or route through a persistent browser profile that has established trust.
+- Rate limit scraping: no more than 1 scrape per user per store per hour. Default cadence: once daily.
+- Store and reuse browser profiles/cookies to minimize fresh logins.
+
+### Dockerfile
+
+The Dockerfile must include Playwright and Chromium. Base image pattern:
+
+```dockerfile
+FROM mcr.microsoft.com/playwright/python:v1.49.0-noble
+# Install deps, copy code, etc.
+```
+
+This is a large image (~2GB) due to Chromium. Consider multi-stage builds if the final image can be slimmed down.
+
+## Internal API Endpoints
+
+This service exposes a lightweight internal API (not public-facing):
+
+- `GET /health` — health check
+- `GET /status/{user_id}` — sync status per store for a user
+- `POST /scrape/{user_id}/{store_slug}` — trigger an immediate scrape for a user+store
+- `POST /scrape/{user_id}/all` — trigger scrape across all configured stores
+- `GET /sessions/{user_id}` — list configured store sessions and their status
+
+The public-facing API gateway (`cartsnitch/api`) proxies user-facing requests to this service's internal API.
+
+## Events Published
+
+### `cartsnitch.receipts.ingested`
+
+Published after new receipt data is successfully written to the DB.
+
+```json
+{
+  "event_type": "cartsnitch.receipts.ingested",
+  "timestamp": "2026-03-15T12:00:00Z",
+  "service": "receiptwitness",
+  "payload": {
+    "user_id": "uuid",
+    "store_slug": "meijer",
+    "purchase_id": "uuid",
+    "purchase_date": "2026-03-14",
+    "item_count": 23,
+    "total": 87.42
+  }
+}
+```
+
+## Development Workflow
+
+- **Never push directly to main.** Always create feature branches and open PRs.
+- Branch naming: `feature/<store>/<description>` or `fix/<description>`
+- Use conventional commits: `feat:`, `fix:`, `refactor:`, `docs:`, `chore:`
+- Test parsers with fixture data (sample receipts in `tests/fixtures/`). Scraper integration tests require real credentials and should be tagged/skipped in CI.
+- Local dev: `docker-compose up` starts Postgres, Redis, and the service. Playwright runs inside the container.
+
+## Important Notes
+
+- The Playwright container image is large. On K8s, consider using a dedicated node or tolerating scheduling delays.
+- Each user needs their own authenticated sessions. At 1,000 users × 3 stores = 3,000 sessions to manage. Sessions expire at different rates per retailer.
+- Scraping must be respectful: randomized intervals, rate limiting, no parallel scraping of the same store for the same user.
+- Receipt data structure varies significantly between retailers. The parsers must be robust and handle edge cases (returns, voided items, weighted produce, BOGO items, coupon stacking).
+- Kroger's public API (`product.compact` scope) can be used to enrich scraped data with UPCs and product metadata after receipt parsing. This is optional but improves product normalization downstream.
+- Store credentials for users should ideally NOT be stored by CartSnitch. Prefer a flow where the user authenticates in a controlled browser session, and we capture/store only the resulting session cookies. If credential storage is necessary, use strong encryption and make the tradeoffs clear to users.
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..bb6300d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,67 @@
+# Stage 1: Build dependencies
+FROM python:3.12-slim AS build
+
+WORKDIR /app
+
+# git is required to install cartsnitch-common from GitHub; build-essential and
+# libpq-dev are needed to compile any C-extension wheels (e.g. psycopg2 fallback)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    libpq-dev \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY pyproject.toml ./
+COPY src/ ./src/
+
+# cartsnitch-common is not on PyPI — install it directly from GitHub, then
+# install the rest of the package dependencies in a single resolver pass so
+# pip can satisfy the cartsnitch-common>=0.1.0 constraint declared in
+# pyproject.toml without hitting PyPI for it.
+RUN pip install --no-cache-dir --prefix=/install \
+    "cartsnitch-common @ git+https://github.com/cartsnitch/common.git@76685ed0384103228cd670b477b967e7752ebe6b" \
+    .
+
+# Stage 2: Production image with Playwright + Chromium
+FROM python:3.12-slim AS prod
+
+WORKDIR /app
+
+# Install Playwright system dependencies for Chromium
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libnss3 \
+    libatk1.0-0 \
+    libatk-bridge2.0-0 \
+    libcups2 \
+    libdrm2 \
+    libxkbcommon0 \
+    libxcomposite1 \
+    libxdamage1 \
+    libxrandr2 \
+    libgbm1 \
+    libpango-1.0-0 \
+    libcairo2 \
+    libasound2 \
+    libxshmfence1 \
+    libx11-xcb1 \
+    libxcb-dri3-0 \
+    fonts-liberation \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN adduser --system --group --uid 1000 app
+
+COPY --from=build /install /usr/local
+COPY src/ ./src/
+
+# Install Playwright Chromium browser (runs as root; /opt/playwright is world-readable)
+RUN PLAYWRIGHT_BROWSERS_PATH=/opt/playwright playwright install chromium
+
+ENV PLAYWRIGHT_BROWSERS_PATH=/opt/playwright
+
+USER 1000
+EXPOSE 8000
+
+HEALTHCHECK --interval=30s --timeout=3s \
+    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"
+
+CMD ["uvicorn", "receiptwitness.main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/renovate.json b/renovate.json
new file mode 100644
index 0000000..833ba3b
--- /dev/null
+++ b/renovate.json
@@ -0,0 +1,4 @@
+{
+  "$schema": "https://docs.renovatebot.com/renovate-schema.json",
+  "extends": ["local>cartsnitch/.github:renovate-config"]
+}
diff --git a/src/receiptwitness/__init__.py b/src/receiptwitness/__init__.py
new file mode 100644
index 0000000..6b17aab
--- /dev/null
+++ b/src/receiptwitness/__init__.py
@@ -0,0 +1 @@
+"""ReceiptWitness — CartSnitch receipt ingestion service."""
diff --git a/src/receiptwitness/api/__init__.py b/src/receiptwitness/api/__init__.py
new file mode 100644
index 0000000..74ded59
--- /dev/null
+++ b/src/receiptwitness/api/__init__.py
@@ -0,0 +1 @@
+"""Internal API for ReceiptWitness service."""
diff --git a/src/receiptwitness/api/routes.py b/src/receiptwitness/api/routes.py
new file mode 100644
index 0000000..23cc109
--- /dev/null
+++ b/src/receiptwitness/api/routes.py
@@ -0,0 +1,10 @@
+"""Internal API routes for triggering scrapes and checking status."""
+
+from fastapi import APIRouter
+
+router = APIRouter()
+
+
+@router.get("/health")
+async def health():
+    return {"status": "ok", "service": "receiptwitness"}
diff --git a/src/receiptwitness/config.py b/src/receiptwitness/config.py
new file mode 100644
index 0000000..1341f3f
--- /dev/null
+++ b/src/receiptwitness/config.py
@@ -0,0 +1,26 @@
+"""Service-specific configuration for ReceiptWitness."""
+
+from pydantic_settings import BaseSettings
+
+
+class ReceiptWitnessSettings(BaseSettings):
+    model_config = {"env_prefix": "RW_"}
+
+    # Inherited from cartsnitch-common
+    database_url: str = "postgresql+asyncpg://cartsnitch:cartsnitch@localhost:5432/cartsnitch"
+    redis_url: str = "redis://localhost:6379/0"
+
+    # Session encryption
+    session_encryption_key: str = ""
+
+    # Scraping defaults
+    scrape_interval_seconds: int = 86400  # 24 hours
+    min_request_delay_ms: int = 1000
+    max_request_delay_ms: int = 5000
+
+    # Playwright
+    headless: bool = True
+    browser_timeout_ms: int = 60000
+
+
+settings = ReceiptWitnessSettings()
diff --git a/src/receiptwitness/events.py b/src/receiptwitness/events.py
new file mode 100644
index 0000000..3d75614
--- /dev/null
+++ b/src/receiptwitness/events.py
@@ -0,0 +1,75 @@
+"""Publish receipt ingestion events to Redis/DragonflyDB pub/sub."""
+
+import json
+import logging
+from datetime import UTC, datetime
+from decimal import Decimal
+
+import redis.asyncio as aioredis
+
+from receiptwitness.config import settings
+
+logger = logging.getLogger(__name__)
+
+CHANNEL_RECEIPTS_INGESTED = "cartsnitch.receipts.ingested"
+
+# Module-level connection pool — shared across all publish calls
+_pool: aioredis.ConnectionPool | None = None
+
+
+class _DecimalEncoder(json.JSONEncoder):
+    def default(self, o):
+        if isinstance(o, Decimal):
+            return float(o)
+        return super().default(o)
+
+
+def _get_pool() -> aioredis.ConnectionPool:
+    """Get or create the shared Redis connection pool."""
+    global _pool
+    if _pool is None:
+        _pool = aioredis.ConnectionPool.from_url(
+            settings.redis_url, decode_responses=True, max_connections=10
+        )
+    return _pool
+
+
+async def get_redis_client() -> aioredis.Redis:
+    """Create an async Redis/DragonflyDB client with connection pooling."""
+    return aioredis.Redis(connection_pool=_get_pool())
+
+
+async def publish_receipt_ingested(
+    user_id: str,
+    store_slug: str,
+    purchase_id: str,
+    purchase_date: str,
+    item_count: int,
+    total: Decimal | float,
+) -> None:
+    """Publish a cartsnitch.receipts.ingested event after successful ingestion."""
+    event = {
+        "event_type": CHANNEL_RECEIPTS_INGESTED,
+        "timestamp": datetime.now(UTC).isoformat(),
+        "service": "receiptwitness",
+        "payload": {
+            "user_id": user_id,
+            "store_slug": store_slug,
+            "purchase_id": purchase_id,
+            "purchase_date": purchase_date,
+            "item_count": item_count,
+            "total": float(total) if isinstance(total, Decimal) else total,
+        },
+    }
+
+    try:
+        client = await get_redis_client()
+        await client.publish(CHANNEL_RECEIPTS_INGESTED, json.dumps(event, cls=_DecimalEncoder))
+        logger.info(
+            "Published %s event for purchase %s",
+            CHANNEL_RECEIPTS_INGESTED,
+            purchase_id,
+        )
+    except aioredis.ConnectionError:
+        logger.error("Failed to publish event — Redis/DragonflyDB connection error")
+        raise
diff --git a/src/receiptwitness/main.py b/src/receiptwitness/main.py
new file mode 100644
index 0000000..55cda42
--- /dev/null
+++ b/src/receiptwitness/main.py
@@ -0,0 +1,8 @@
+"""FastAPI app entrypoint for ReceiptWitness."""
+
+from fastapi import FastAPI
+
+from receiptwitness.api.routes import router
+
+app = FastAPI(title="ReceiptWitness", version="0.1.0")
+app.include_router(router)
diff --git a/src/receiptwitness/parsers/__init__.py b/src/receiptwitness/parsers/__init__.py
new file mode 100644
index 0000000..2b56ce8
--- /dev/null
+++ b/src/receiptwitness/parsers/__init__.py
@@ -0,0 +1 @@
+"""Receipt parsers for each retailer."""
diff --git a/src/receiptwitness/parsers/kroger.py b/src/receiptwitness/parsers/kroger.py
new file mode 100644
index 0000000..13e5a20
--- /dev/null
+++ b/src/receiptwitness/parsers/kroger.py
@@ -0,0 +1,148 @@
+"""Kroger receipt parser.
+
+Transforms raw Kroger receipt JSON into the common PurchaseCreate schema.
+Kroger receipt data uses different field names than Meijer — this parser
+handles Kroger-specific naming conventions and receipt structure.
+"""
+
+import logging
+from decimal import Decimal, InvalidOperation
+
+from receiptwitness.scrapers.base import RawReceipt
+
+logger = logging.getLogger(__name__)
+
+
+def _to_decimal(value, default: str = "0") -> Decimal:
+    """Safely convert a value to Decimal."""
+    if value is None:
+        return Decimal(default)
+    try:
+        return Decimal(str(value))
+    except (InvalidOperation, ValueError, TypeError):
+        return Decimal(default)
+
+
+def _parse_item(item: dict) -> dict:
+    """Parse a single line item from a Kroger receipt.
+
+    Kroger items typically include fields like:
+    - description / itemDescription / productName
+    - upc / krogerProductId
+    - quantity / qty
+    - basePrice / unitPrice / price
+    - totalPrice / extendedAmount / lineTotal
+    - regularPrice / originalPrice
+    - salePrice / promoPrice
+    - couponAmount / couponSavings
+    - loyaltyDiscount / fuelPointsDiscount / plusCardSavings
+    - department / category / aisle
+    """
+    description = (
+        item.get("description")
+        or item.get("itemDescription")
+        or item.get("productName")
+        or item.get("name")
+        or "UNKNOWN ITEM"
+    )
+
+    quantity = _to_decimal(item.get("quantity", item.get("qty", item.get("quantitySold", 1))), "1")
+    unit_price = _to_decimal(item.get("basePrice", item.get("unitPrice", item.get("price", 0))))
+    extended_price = _to_decimal(
+        item.get("totalPrice", item.get("extendedAmount", item.get("lineTotal")))
+    )
+
+    # Compute extended_price if not provided
+    if extended_price == Decimal("0") and unit_price != Decimal("0"):
+        extended_price = unit_price * quantity
+
+    regular_price = item.get("regularPrice", item.get("originalPrice"))
+    sale_price = item.get("salePrice", item.get("promoPrice"))
+    coupon_discount = item.get(
+        "couponAmount", item.get("couponSavings", item.get("couponDiscount"))
+    )
+    loyalty_discount = item.get(
+        "plusCardSavings",
+        item.get("loyaltyDiscount", item.get("fuelPointsDiscount")),
+    )
+
+    # UPC handling — Kroger may use krogerProductId or upc
+    upc = item.get("upc", item.get("UPC", item.get("krogerProductId")))
+    if upc:
+        upc = str(upc).strip().lstrip("0") or None
+
+    category = item.get("department", item.get("category", item.get("aisle")))
+
+    # Weight info for produce/deli items
+    weight = item.get("weight", item.get("netWeight"))
+    extra = {}
+    if weight is not None:
+        extra["weight"] = str(weight)
+        weight_uom = item.get("weightUom", item.get("unitOfMeasure"))
+        if weight_uom:
+            extra["weight_uom"] = weight_uom
+
+    result = {
+        "product_name_raw": description.strip(),
+        "upc": upc,
+        "quantity": quantity,
+        "unit_price": unit_price,
+        "extended_price": extended_price,
+        "regular_price": (_to_decimal(regular_price) if regular_price is not None else None),
+        "sale_price": (_to_decimal(sale_price) if sale_price is not None else None),
+        "coupon_discount": (_to_decimal(coupon_discount) if coupon_discount is not None else None),
+        "loyalty_discount": (
+            _to_decimal(loyalty_discount) if loyalty_discount is not None else None
+        ),
+        "category_raw": category.strip() if category else None,
+    }
+
+    return result
+
+
+def parse_kroger_receipt(raw: RawReceipt) -> dict:
+    """Parse a RawReceipt from Kroger into a PurchaseCreate-compatible dict."""
+    data = raw.raw_data
+    detail = data.get("detail", {})
+
+    # Parse items — Kroger uses "items" or "lineItems" or "receiptItems"
+    raw_items = detail.get("items", detail.get("lineItems", detail.get("receiptItems", [])))
+    items = []
+    for raw_item in raw_items:
+        # Skip voided / returned items
+        if raw_item.get("voided") or raw_item.get("status") in (
+            "VOIDED",
+            "RETURNED",
+        ):
+            logger.debug("Skipping voided/returned item: %s", raw_item.get("description"))
+            continue
+        if raw_item.get("returnFlag") or raw_item.get("isReturn"):
+            logger.debug("Skipping returned item: %s", raw_item.get("description"))
+            continue
+        items.append(_parse_item(raw_item))
+
+    # Parse totals — Kroger uses various field names
+    total = _to_decimal(
+        detail.get(
+            "total",
+            data.get("total", data.get("orderTotal", data.get("grandTotal", 0))),
+        )
+    )
+    subtotal = detail.get("subtotal", data.get("subtotal", data.get("subTotal")))
+    tax = detail.get("tax", data.get("tax", data.get("salesTax")))
+    savings = detail.get(
+        "totalSavings",
+        data.get("savings", data.get("totalDiscount", data.get("youSaved"))),
+    )
+
+    return {
+        "receipt_id": raw.receipt_id,
+        "purchase_date": raw.purchase_date,
+        "total": total,
+        "subtotal": _to_decimal(subtotal) if subtotal is not None else None,
+        "tax": _to_decimal(tax) if tax is not None else None,
+        "savings_total": _to_decimal(savings) if savings is not None else None,
+        "source_url": raw.source_url,
+        "raw_data": data,
+        "items": items,
+    }
diff --git a/src/receiptwitness/parsers/meijer.py b/src/receiptwitness/parsers/meijer.py
new file mode 100644
index 0000000..d1960d0
--- /dev/null
+++ b/src/receiptwitness/parsers/meijer.py
@@ -0,0 +1,138 @@
+"""Parse raw Meijer mPerks receipt data into PurchaseCreate-compatible dicts.
+
+The mPerks receipt JSON structure (reverse-engineered from their SPA)
+typically looks like:
+
+Transaction listing:
+{
+    "transactions": [
+        {
+            "transactionId": "12345",
+            "transactionDate": "2026-03-10T14:30:00Z",
+            "storeNumber": "123",
+            "total": 87.42,
+            "savings": 12.50
+        }
+    ]
+}
+
+Receipt detail:
+{
+    "receiptId": "12345",
+    "items": [
+        {
+            "description": "ORGANIC BANANAS",
+            "upc": "0000000004011",
+            "quantity": 1,
+            "price": 0.69,
+            "extendedPrice": 0.69,
+            "regularPrice": 0.79,
+            "salePrice": 0.69,
+            "couponDiscount": 0.0,
+            "mperksDiscount": 0.10,
+            "category": "PRODUCE"
+        }
+    ],
+    "subtotal": 74.92,
+    "tax": 5.24,
+    "total": 87.42,
+    "totalSavings": 12.50
+}
+"""
+
+import logging
+from decimal import Decimal, InvalidOperation
+
+from receiptwitness.scrapers.base import RawReceipt
+
+logger = logging.getLogger(__name__)
+
+
+def _to_decimal(value, default: str = "0") -> Decimal:
+    """Safely convert a value to Decimal."""
+    if value is None:
+        return Decimal(default)
+    try:
+        return Decimal(str(value))
+    except (InvalidOperation, ValueError, TypeError):
+        return Decimal(default)
+
+
+def _parse_item(item: dict) -> dict:
+    """Parse a single line item from Meijer receipt detail."""
+    description = (
+        item.get("description") or item.get("itemDescription") or item.get("name") or "UNKNOWN ITEM"
+    )
+
+    quantity = _to_decimal(item.get("quantity", item.get("qty", 1)), "1")
+    unit_price = _to_decimal(item.get("price", item.get("unitPrice", 0)))
+    extended_price = _to_decimal(item.get("extendedPrice", item.get("totalPrice")))
+
+    # If extended_price wasn't provided, compute it
+    if extended_price == Decimal("0") and unit_price != Decimal("0"):
+        extended_price = unit_price * quantity
+
+    regular_price = item.get("regularPrice")
+    sale_price = item.get("salePrice")
+    coupon_discount = item.get("couponDiscount", item.get("couponSavings"))
+    loyalty_discount = item.get("mperksDiscount", item.get("loyaltyDiscount"))
+
+    upc = item.get("upc", item.get("UPC"))
+    if upc:
+        upc = str(upc).strip().lstrip("0") or None
+
+    category = item.get("category", item.get("departmentDescription"))
+
+    return {
+        "product_name_raw": description.strip(),
+        "upc": upc,
+        "quantity": quantity,
+        "unit_price": unit_price,
+        "extended_price": extended_price,
+        "regular_price": _to_decimal(regular_price) if regular_price is not None else None,
+        "sale_price": _to_decimal(sale_price) if sale_price is not None else None,
+        "coupon_discount": (_to_decimal(coupon_discount) if coupon_discount is not None else None),
+        "loyalty_discount": (
+            _to_decimal(loyalty_discount) if loyalty_discount is not None else None
+        ),
+        "category_raw": category.strip() if category else None,
+    }
+
+
+def parse_meijer_receipt(raw: RawReceipt) -> dict:
+    """Parse a RawReceipt from Meijer into a PurchaseCreate-compatible dict.
+
+    Returns a dict with keys matching PurchaseCreate schema fields.
+    The caller is responsible for setting store_id and store_location_id
+    from the store registry.
+    """
+    data = raw.raw_data
+    detail = data.get("detail", {})
+
+    # Parse items from the detail response
+    raw_items = detail.get("items", detail.get("lineItems", []))
+    items = []
+    for raw_item in raw_items:
+        # Skip voided items
+        if raw_item.get("voided") or raw_item.get("status") == "VOIDED":
+            logger.debug("Skipping voided item: %s", raw_item.get("description"))
+            continue
+        items.append(_parse_item(raw_item))
+
+    # Parse totals
+    total = _to_decimal(detail.get("total", data.get("total", data.get("transactionTotal", 0))))
+    subtotal = detail.get("subtotal", data.get("subtotal"))
+    tax = detail.get("tax", data.get("tax"))
+    savings = detail.get("totalSavings", data.get("savings", data.get("totalDiscount")))
+
+    return {
+        "receipt_id": raw.receipt_id,
+        "purchase_date": raw.purchase_date,
+        "total": total,
+        "subtotal": _to_decimal(subtotal) if subtotal is not None else None,
+        "tax": _to_decimal(tax) if tax is not None else None,
+        "savings_total": _to_decimal(savings) if savings is not None else None,
+        "source_url": raw.source_url,
+        "raw_data": data,
+        "items": items,
+    }
diff --git a/src/receiptwitness/parsers/target.py b/src/receiptwitness/parsers/target.py
new file mode 100644
index 0000000..25b4204
--- /dev/null
+++ b/src/receiptwitness/parsers/target.py
@@ -0,0 +1,191 @@
+"""Target Circle receipt parser.
+
+Transforms raw Target in-store receipt JSON into the common PurchaseCreate schema.
+Target receipt data includes Circle pricing, BOGO deals, and Circle rewards
+discounts that need special handling.
+
+Target receipt detail structure (reverse-engineered from target.com SPA):
+
+{
+    "orderId": "TGT-2026-0315-7890",
+    "items": [
+        {
+            "description": "GOOD & GATHER WHOLE MILK GAL",
+            "tcin": "14767459",
+            "upc": "0085239100123",
+            "quantity": 1,
+            "unitPrice": 3.89,
+            "totalPrice": 3.89,
+            "regularPrice": 4.19,
+            "circlePrice": 3.89,
+            "couponDiscount": 0.0,
+            "circleRewardsDiscount": 0.30,
+            "promoDescription": "Circle offer: Save 30c",
+            "department": "GROCERY"
+        }
+    ],
+    "subtotal": 78.32,
+    "tax": 4.89,
+    "total": 83.21,
+    "totalSavings": 11.45
+}
+"""
+
+import logging
+from decimal import Decimal, InvalidOperation
+
+from receiptwitness.scrapers.base import RawReceipt
+
+logger = logging.getLogger(__name__)
+
+
+def _to_decimal(value, default: str = "0") -> Decimal:
+    """Safely convert a value to Decimal."""
+    if value is None:
+        return Decimal(default)
+    try:
+        return Decimal(str(value))
+    except (InvalidOperation, ValueError, TypeError):
+        return Decimal(default)
+
+
+def _parse_item(item: dict) -> dict:
+    """Parse a single line item from a Target receipt.
+
+    Target items may include fields like:
+    - description / itemDescription / productName
+    - tcin (Target internal product ID) / upc / dpci
+    - quantity / qty
+    - unitPrice / price
+    - totalPrice / extendedPrice / lineTotal
+    - regularPrice / originalPrice
+    - circlePrice / salePrice / promoPrice
+    - couponDiscount / couponSavings
+    - circleRewardsDiscount / circleDiscount / loyaltyDiscount
+    - promoDescription / offerDescription (e.g. "BOGO 50% off", "Circle offer")
+    - department / category
+    """
+    description = (
+        item.get("description")
+        or item.get("itemDescription")
+        or item.get("productName")
+        or item.get("name")
+        or "UNKNOWN ITEM"
+    )
+
+    quantity = _to_decimal(item.get("quantity", item.get("qty", item.get("quantitySold", 1))), "1")
+    unit_price = _to_decimal(item.get("unitPrice", item.get("price", item.get("basePrice", 0))))
+    extended_price = _to_decimal(
+        item.get("totalPrice", item.get("extendedPrice", item.get("lineTotal")))
+    )
+
+    # Compute extended_price if not provided
+    if extended_price == Decimal("0") and unit_price != Decimal("0"):
+        extended_price = unit_price * quantity
+
+    regular_price = item.get("regularPrice", item.get("originalPrice"))
+    # Target Circle pricing — circlePrice takes precedence over generic salePrice
+    sale_price = item.get("circlePrice", item.get("salePrice", item.get("promoPrice")))
+    coupon_discount = item.get(
+        "couponDiscount", item.get("couponSavings", item.get("couponAmount"))
+    )
+    # Circle rewards / loyalty discount
+    loyalty_discount = item.get(
+        "circleRewardsDiscount",
+        item.get("circleDiscount", item.get("loyaltyDiscount")),
+    )
+
+    # UPC handling — Target may use tcin, upc, or dpci
+    upc = item.get("upc", item.get("UPC"))
+    if upc:
+        upc = str(upc).strip().lstrip("0") or None
+
+    # Target also has TCIN (Target.com Item Number) and DPCI (Department/Class/Item)
+    tcin = item.get("tcin", item.get("TCIN"))
+    dpci = item.get("dpci", item.get("DPCI"))
+
+    category = item.get("department", item.get("category"))
+
+    # Capture promo/deal description for BOGO and Circle offers
+    promo_description = item.get("promoDescription", item.get("offerDescription"))
+
+    # Weight info for produce/deli items
+    weight = item.get("weight", item.get("netWeight"))
+    extra: dict = {}
+    if weight is not None:
+        extra["weight"] = str(weight)
+        weight_uom = item.get("weightUom", item.get("unitOfMeasure"))
+        if weight_uom:
+            extra["weight_uom"] = weight_uom
+    if tcin:
+        extra["tcin"] = str(tcin)
+    if dpci:
+        extra["dpci"] = str(dpci)
+    if promo_description:
+        extra["promo_description"] = promo_description
+
+    result: dict = {
+        "product_name_raw": description.strip(),
+        "upc": upc,
+        "quantity": quantity,
+        "unit_price": unit_price,
+        "extended_price": extended_price,
+        "regular_price": _to_decimal(regular_price) if regular_price is not None else None,
+        "sale_price": _to_decimal(sale_price) if sale_price is not None else None,
+        "coupon_discount": (_to_decimal(coupon_discount) if coupon_discount is not None else None),
+        "loyalty_discount": (
+            _to_decimal(loyalty_discount) if loyalty_discount is not None else None
+        ),
+        "category_raw": category.strip() if category else None,
+    }
+
+    return result
+
+
+def parse_target_receipt(raw: RawReceipt) -> dict:
+    """Parse a RawReceipt from Target into a PurchaseCreate-compatible dict."""
+    data = raw.raw_data
+    detail = data.get("detail", {})
+
+    # Parse items — Target uses "items" or "lineItems"
+    raw_items = detail.get("items", detail.get("lineItems", []))
+    items = []
+    for raw_item in raw_items:
+        # Skip voided / returned items
+        if raw_item.get("voided") or raw_item.get("status") in (
+            "VOIDED",
+            "RETURNED",
+            "CANCELLED",
+        ):
+            logger.debug("Skipping voided/returned item: %s", raw_item.get("description"))
+            continue
+        if raw_item.get("returnFlag") or raw_item.get("isReturn"):
+            logger.debug("Skipping returned item: %s", raw_item.get("description"))
+            continue
+        items.append(_parse_item(raw_item))
+
+    # Parse totals
+    total = _to_decimal(
+        detail.get(
+            "total",
+            data.get("total", data.get("orderTotal", data.get("grandTotal", 0))),
+        )
+    )
+    subtotal = detail.get("subtotal", data.get("subtotal", data.get("subTotal")))
+    tax = detail.get("tax", data.get("tax", data.get("salesTax")))
+    savings = detail.get(
+        "totalSavings",
+        data.get("savings", data.get("totalDiscount", data.get("circleSavings"))),
+    )
+
+    return {
+        "receipt_id": raw.receipt_id,
+        "purchase_date": raw.purchase_date,
+        "total": total,
+        "subtotal": _to_decimal(subtotal) if subtotal is not None else None,
+        "tax": _to_decimal(tax) if tax is not None else None,
+        "savings_total": _to_decimal(savings) if savings is not None else None,
+        "source_url": raw.source_url,
+        "raw_data": data,
+        "items": items,
+    }
diff --git a/src/receiptwitness/pipeline/__init__.py b/src/receiptwitness/pipeline/__init__.py
new file mode 100644
index 0000000..e590387
--- /dev/null
+++ b/src/receiptwitness/pipeline/__init__.py
@@ -0,0 +1,30 @@
+"""Receipt & product matching pipeline — receipt normalization and product dedup."""
+
+from receiptwitness.pipeline.matching import (
+    ConfidenceLevel,
+    ProductMatcher,
+    match_purchase_item,
+)
+from receiptwitness.pipeline.normalization import (
+    MatchMethod,
+    MatchResult,
+    clean_name,
+    extract_size_info,
+    jaccard_similarity,
+    normalize_product,
+)
+from receiptwitness.pipeline.receipt import normalize_receipt, parse_meijer_item
+
+__all__ = [
+    "ConfidenceLevel",
+    "MatchMethod",
+    "MatchResult",
+    "ProductMatcher",
+    "clean_name",
+    "extract_size_info",
+    "jaccard_similarity",
+    "match_purchase_item",
+    "normalize_product",
+    "normalize_receipt",
+    "parse_meijer_item",
+]
diff --git a/src/receiptwitness/pipeline/matching.py b/src/receiptwitness/pipeline/matching.py
new file mode 100644
index 0000000..7e71039
--- /dev/null
+++ b/src/receiptwitness/pipeline/matching.py
@@ -0,0 +1,136 @@
+"""Product matching & dedup — UPC primary, fuzzy name fallback, confidence scoring.
+
+Wraps the Phase 1 normalization module with confidence-level classification
+and batch matching for purchase ingestion.
+"""
+
+import uuid
+from dataclasses import dataclass
+
+from cartsnitch_common.constants import MatchConfidence
+from cartsnitch_common.models.product import NormalizedProduct
+from cartsnitch_common.schemas.purchase import PurchaseItemCreate
+from sqlalchemy.orm import Session
+
+from receiptwitness.pipeline.normalization import (
+    MatchMethod,
+    MatchResult,
+    extract_size_info,
+    normalize_product,
+)
+
+# Re-export for convenience
+ConfidenceLevel = MatchConfidence
+
+
+@dataclass(frozen=True)
+class MatchOutcome:
+    """Result of matching a single purchase item to a normalized product."""
+
+    item_index: int
+    match: MatchResult | None
+    confidence_level: MatchConfidence
+    created_new: bool = False
+
+
+def classify_confidence(score: float, method: MatchMethod) -> MatchConfidence:
+    """Classify a match score into high/medium/low confidence."""
+    if method == MatchMethod.UPC:
+        return MatchConfidence.HIGH
+    # Name-based matching thresholds
+    if score >= 0.8:
+        return MatchConfidence.HIGH
+    if score >= 0.5:
+        return MatchConfidence.MEDIUM
+    return MatchConfidence.LOW
+
+
+def _create_product_from_item(
+    session: Session,
+    item: PurchaseItemCreate,
+) -> NormalizedProduct:
+    """Create a new NormalizedProduct from a purchase item that had no match."""
+    size_info = extract_size_info(item.product_name_raw)
+    product = NormalizedProduct(
+        id=uuid.uuid4(),
+        canonical_name=item.product_name_raw,
+        size=size_info[0] if size_info else None,
+        size_unit=size_info[1] if size_info else None,
+        upc_variants=[item.upc] if item.upc else [],
+    )
+    session.add(product)
+    session.flush()
+    return product
+
+
+class ProductMatcher:
+    """Batch product matcher for purchase ingestion.
+
+    Usage:
+        matcher = ProductMatcher(session)
+        outcomes = matcher.match_items(items)
+    """
+
+    def __init__(
+        self,
+        session: Session,
+        name_threshold: float = 0.4,
+        auto_create: bool = True,
+    ):
+        self.session = session
+        self.name_threshold = name_threshold
+        self.auto_create = auto_create
+
+    def match_single(
+        self,
+        item: PurchaseItemCreate,
+    ) -> tuple[NormalizedProduct | None, MatchResult | None, MatchConfidence]:
+        """Match a single purchase item to a normalized product.
+
+        Returns (product, match_result, confidence_level).
+        If auto_create is True and no match found, creates a new product.
+        """
+        result = normalize_product(
+            self.session,
+            item.product_name_raw,
+            upc=item.upc,
+            name_threshold=self.name_threshold,
+        )
+
+        if result:
+            confidence = classify_confidence(result.confidence, result.method)
+            return result.product, result, confidence
+
+        if self.auto_create:
+            product = _create_product_from_item(self.session, item)
+            return product, None, MatchConfidence.LOW
+
+        return None, None, MatchConfidence.LOW
+
+    def match_items(self, items: list[PurchaseItemCreate]) -> list[MatchOutcome]:
+        """Match a batch of purchase items. Returns outcomes in order."""
+        outcomes: list[MatchOutcome] = []
+        for idx, item in enumerate(items):
+            product, result, confidence = self.match_single(item)
+            created = result is None and product is not None
+            outcomes.append(
+                MatchOutcome(
+                    item_index=idx,
+                    match=result,
+                    confidence_level=confidence,
+                    created_new=created,
+                )
+            )
+        return outcomes
+
+
+def match_purchase_item(
+    session: Session,
+    item: PurchaseItemCreate,
+    name_threshold: float = 0.4,
+    auto_create: bool = True,
+) -> tuple[NormalizedProduct | None, MatchConfidence]:
+    """Convenience function: match a single item, return (product, confidence)."""
+    matcher = ProductMatcher(session, name_threshold=name_threshold, auto_create=auto_create)
+    product, _, confidence = matcher.match_single(item)
+    return product, confidence
diff --git a/src/receiptwitness/pipeline/normalization.py b/src/receiptwitness/pipeline/normalization.py
new file mode 100644
index 0000000..c1fade9
--- /dev/null
+++ b/src/receiptwitness/pipeline/normalization.py
@@ -0,0 +1,155 @@
+"""Product normalization — Phase 1: UPC matching + fuzzy name matching.
+
+Matches products across retailers by:
+1. Exact UPC match (highest confidence)
+2. Fuzzy name matching via token-based Jaccard similarity (lower confidence)
+"""
+
+import re
+from dataclasses import dataclass
+from enum import StrEnum
+
+from cartsnitch_common.models.product import NormalizedProduct
+from sqlalchemy import select
+from sqlalchemy.orm import Session
+
+
+class MatchMethod(StrEnum):
+    """How a product match was determined."""
+
+    UPC = "upc"
+    NAME = "name"
+
+
+@dataclass(frozen=True)
+class MatchResult:
+    """Result of a product normalization attempt."""
+
+    product: NormalizedProduct
+    confidence: float
+    method: MatchMethod
+
+
+# Noise words stripped during name cleaning
+_NOISE_WORDS = frozenset(
+    {
+        "the",
+        "a",
+        "an",
+        "and",
+        "or",
+        "of",
+        "with",
+        "in",
+        "for",
+        "to",
+        "brand",
+        "original",
+        "classic",
+        "new",
+        "improved",
+    }
+)
+
+# Regex for extracting size info (e.g., "16 oz", "1.5 lb", "12 ct")
+_SIZE_PATTERN = re.compile(
+    r"(\d+(?:\.\d+)?)\s*(oz|fl\s*oz|lb|lbs|g|kg|ml|l|ct|pk|count|pack)\b",
+    re.IGNORECASE,
+)
+
+
+def clean_name(name: str) -> str:
+    """Normalize a product name for comparison.
+
+    - Lowercase
+    - Remove size info (e.g., "16 oz")
+    - Strip noise words
+    - Collapse whitespace
+    """
+    cleaned = name.lower()
+    cleaned = _SIZE_PATTERN.sub("", cleaned)
+    cleaned = re.sub(r"[^\w\s]", " ", cleaned)
+    tokens = cleaned.split()
+    tokens = [t for t in tokens if t not in _NOISE_WORDS]
+    return " ".join(tokens)
+
+
+def extract_size_info(name: str) -> tuple[str, str] | None:
+    """Extract (size, unit) from a product name, if present."""
+    match = _SIZE_PATTERN.search(name)
+    if match:
+        return match.group(1), match.group(2).lower().replace(" ", "_")
+    return None
+
+
+def jaccard_similarity(a: str, b: str) -> float:
+    """Token-based Jaccard similarity between two cleaned names."""
+    tokens_a = set(a.split())
+    tokens_b = set(b.split())
+    if not tokens_a or not tokens_b:
+        return 0.0
+    intersection = tokens_a & tokens_b
+    union = tokens_a | tokens_b
+    return len(intersection) / len(union)
+
+
+def match_by_upc(session: Session, upc: str) -> MatchResult | None:
+    """Find a normalized product by exact UPC match.
+
+    Loads products with upc_variants and checks membership in Python
+    for cross-database compatibility (works on both PostgreSQL and SQLite).
+    """
+    # TODO: Use PostgreSQL JSON containment query (@>) for production.
+    # Current approach loads all products into memory — acceptable for tests
+    # and small datasets, but will not scale.
+    stmt = select(NormalizedProduct).where(NormalizedProduct.upc_variants.is_not(None))
+    products = session.execute(stmt).scalars().all()
+    for product in products:
+        if product.upc_variants and upc in product.upc_variants:
+            return MatchResult(product=product, confidence=1.0, method=MatchMethod.UPC)
+    return None
+
+
+def match_by_name(
+    session: Session,
+    name: str,
+    threshold: float = 0.5,
+) -> MatchResult | None:
+    """Find the best normalized product by fuzzy name matching.
+
+    Loads all normalized products and computes Jaccard similarity.
+    Returns the best match above the threshold, or None.
+    """
+    # TODO: Use pg_trgm similarity index for production.
+    # Current approach loads all products into memory — acceptable for tests
+    # and small datasets, but will not scale.
+    cleaned = clean_name(name)
+    stmt = select(NormalizedProduct)
+    products = session.execute(stmt).scalars().all()
+
+    best_match: NormalizedProduct | None = None
+    best_score = 0.0
+
+    for product in products:
+        score = jaccard_similarity(cleaned, clean_name(product.canonical_name))
+        if score > best_score and score >= threshold:
+            best_score = score
+            best_match = product
+
+    if best_match:
+        return MatchResult(product=best_match, confidence=best_score, method=MatchMethod.NAME)
+    return None
+
+
+def normalize_product(
+    session: Session,
+    name: str,
+    upc: str | None = None,
+    name_threshold: float = 0.5,
+) -> MatchResult | None:
+    """Full normalization pipeline: UPC first, then fuzzy name fallback."""
+    if upc:
+        result = match_by_upc(session, upc)
+        if result:
+            return result
+    return match_by_name(session, name, threshold=name_threshold)
diff --git a/src/receiptwitness/pipeline/receipt.py b/src/receiptwitness/pipeline/receipt.py
new file mode 100644
index 0000000..7d3e863
--- /dev/null
+++ b/src/receiptwitness/pipeline/receipt.py
@@ -0,0 +1,144 @@
+"""Receipt normalization — parse raw Meijer scraper output into purchase records.
+
+Maps raw receipt fields, cleans product names, extracts quantities/units.
+"""
+
+import re
+from datetime import date
+from decimal import Decimal, InvalidOperation
+
+from cartsnitch_common.schemas.purchase import PurchaseCreate, PurchaseItemCreate
+
+
+def _clean_product_name(raw: str) -> str:
+    """Clean raw product name from scraper output."""
+    cleaned = raw.strip()
+    # Remove leading/trailing non-alphanumeric chars
+    cleaned = re.sub(r"^\W+|\W+$", "", cleaned)
+    # Collapse internal whitespace
+    cleaned = re.sub(r"\s+", " ", cleaned)
+    return cleaned
+
+
+def _safe_decimal(
+    value: str | float | int | Decimal | None,
+    default: Decimal = Decimal("0"),
+) -> Decimal:
+    """Safely convert a value to Decimal."""
+    if value is None:
+        return default
+    try:
+        return Decimal(str(value))
+    except (InvalidOperation, ValueError):
+        return default
+
+
+def parse_meijer_item(raw_item: dict) -> PurchaseItemCreate:
+    """Parse a single Meijer scraper line item into a PurchaseItemCreate.
+
+    Expected raw_item keys (from Meijer scraper):
+      - description / name: product name
+      - upc / upcCode: UPC barcode
+      - quantity / qty: number of units
+      - unitPrice / price: per-unit price
+      - extendedPrice / totalPrice: line total
+      - regularPrice: shelf price before discounts
+      - salePrice: sale price if applicable
+      - couponAmount / couponDiscount: coupon savings
+      - loyaltyAmount / loyaltyDiscount: loyalty savings
+      - category / department: raw category
+    """
+    name = raw_item.get("description") or raw_item.get("name") or ""
+    cleaned_name = _clean_product_name(name)
+
+    upc = raw_item.get("upc") or raw_item.get("upcCode")
+    if upc:
+        upc = str(upc).strip().lstrip("0") or str(upc).strip()
+
+    qty = _safe_decimal(
+        raw_item.get("quantity") or raw_item.get("qty"),
+        default=Decimal("1"),
+    )
+
+    unit_price = _safe_decimal(raw_item.get("unitPrice") or raw_item.get("price"))
+    extended = _safe_decimal(raw_item.get("extendedPrice") or raw_item.get("totalPrice"))
+    if extended == Decimal("0") and unit_price > 0:
+        extended = unit_price * qty
+
+    regular = raw_item.get("regularPrice")
+    sale = raw_item.get("salePrice")
+    coupon = raw_item.get("couponAmount") or raw_item.get("couponDiscount")
+    loyalty = raw_item.get("loyaltyAmount") or raw_item.get("loyaltyDiscount")
+    category = raw_item.get("category") or raw_item.get("department")
+
+    return PurchaseItemCreate(
+        product_name_raw=cleaned_name,
+        upc=upc,
+        quantity=qty,
+        unit_price=unit_price,
+        extended_price=extended,
+        regular_price=_safe_decimal(regular) if regular is not None else None,
+        sale_price=_safe_decimal(sale) if sale is not None else None,
+        coupon_discount=_safe_decimal(coupon) if coupon is not None else None,
+        loyalty_discount=_safe_decimal(loyalty) if loyalty is not None else None,
+        category_raw=str(category).strip() if category else None,
+    )
+
+
+def normalize_receipt(
+    raw_receipt: dict,
+    user_id: str,
+    store_id: str,
+) -> PurchaseCreate:
+    """Parse a complete Meijer raw receipt into a PurchaseCreate.
+
+    Expected raw_receipt keys:
+      - receiptId / receipt_id / id: unique receipt identifier
+      - date / purchaseDate / purchase_date: purchase date (YYYY-MM-DD or similar)
+      - total / totalAmount: receipt total
+      - subtotal: pre-tax subtotal
+      - tax / taxAmount: tax amount
+      - savings / totalSavings: total discount savings
+      - items: list of raw line item dicts
+    """
+    import uuid
+
+    receipt_id = str(
+        raw_receipt.get("receiptId")
+        or raw_receipt.get("receipt_id")
+        or raw_receipt.get("id")
+        or uuid.uuid4()
+    )
+
+    raw_date = (
+        raw_receipt.get("date")
+        or raw_receipt.get("purchaseDate")
+        or raw_receipt.get("purchase_date")
+    )
+    if isinstance(raw_date, str):
+        purchase_date = date.fromisoformat(raw_date[:10])
+    elif isinstance(raw_date, date):
+        purchase_date = raw_date
+    else:
+        purchase_date = date.today()
+
+    total = _safe_decimal(raw_receipt.get("total") or raw_receipt.get("totalAmount"))
+    subtotal = raw_receipt.get("subtotal")
+    tax = raw_receipt.get("tax") or raw_receipt.get("taxAmount")
+    savings = raw_receipt.get("savings") or raw_receipt.get("totalSavings")
+
+    raw_items = raw_receipt.get("items") or []
+    items = [parse_meijer_item(item) for item in raw_items]
+
+    return PurchaseCreate(
+        user_id=uuid.UUID(user_id) if isinstance(user_id, str) else user_id,
+        store_id=uuid.UUID(store_id) if isinstance(store_id, str) else store_id,
+        receipt_id=receipt_id,
+        purchase_date=purchase_date,
+        total=total,
+        subtotal=_safe_decimal(subtotal) if subtotal is not None else None,
+        tax=_safe_decimal(tax) if tax is not None else None,
+        savings_total=_safe_decimal(savings) if savings is not None else None,
+        raw_data=raw_receipt,
+        items=items,
+    )
diff --git a/src/receiptwitness/scrapers/__init__.py b/src/receiptwitness/scrapers/__init__.py
new file mode 100644
index 0000000..cfc8d9e
--- /dev/null
+++ b/src/receiptwitness/scrapers/__init__.py
@@ -0,0 +1 @@
+"""Retailer scrapers."""
diff --git a/src/receiptwitness/scrapers/base.py b/src/receiptwitness/scrapers/base.py
new file mode 100644
index 0000000..fd5fdc3
--- /dev/null
+++ b/src/receiptwitness/scrapers/base.py
@@ -0,0 +1,72 @@
+"""Abstract base scraper interface for all retailer scrapers."""
+
+import asyncio
+import random
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import datetime
+
+from receiptwitness.config import settings
+
+
+@dataclass
+class SessionData:
+    """Holds session cookies and metadata for a retailer login."""
+
+    cookies: list[dict]
+    user_agent: str
+    created_at: datetime
+    expires_at: datetime | None = None
+    extra: dict = field(default_factory=dict)
+
+
+@dataclass
+class RawReceipt:
+    """Raw receipt data before parsing."""
+
+    receipt_id: str
+    purchase_date: str
+    store_number: str | None = None
+    raw_data: dict = field(default_factory=dict)
+    source_url: str | None = None
+
+
+class BaseScraper(ABC):
+    """All retailer scrapers implement this interface.
+
+    Provides common functionality: human-like delays, rate limiting guards,
+    and the abstract methods each retailer scraper must implement.
+    """
+
+    @abstractmethod
+    async def login(self, username: str, password: str) -> SessionData:
+        """Authenticate with the retailer portal and return session data."""
+        ...
+
+    @abstractmethod
+    async def check_session(self, session: SessionData) -> bool:
+        """Verify if an existing session is still valid."""
+        ...
+
+    @abstractmethod
+    async def scrape_receipts(
+        self, session: SessionData, since: datetime | None = None
+    ) -> list[RawReceipt]:
+        """Scrape receipt data from the retailer portal."""
+        ...
+
+    @abstractmethod
+    def parse_receipt(self, raw: RawReceipt) -> dict:
+        """Parse a raw receipt into structured data.
+
+        Returns a dict with keys matching PurchaseCreate schema fields,
+        including an 'items' list matching PurchaseItemCreate fields.
+        """
+        ...
+
+    async def human_delay(self, min_ms: int | None = None, max_ms: int | None = None) -> None:
+        """Sleep for a randomized human-like interval."""
+        lo = min_ms or settings.min_request_delay_ms
+        hi = max_ms or settings.max_request_delay_ms
+        delay = random.randint(lo, hi) / 1000.0
+        await asyncio.sleep(delay)
diff --git a/src/receiptwitness/scrapers/kroger.py b/src/receiptwitness/scrapers/kroger.py
new file mode 100644
index 0000000..a7993af
--- /dev/null
+++ b/src/receiptwitness/scrapers/kroger.py
@@ -0,0 +1,344 @@
+"""Kroger loyalty portal scraper using Playwright.
+
+Kroger uses Akamai Bot Manager for aggressive headless browser detection.
+This scraper uses enhanced stealth measures including playwright-stealth,
+realistic fingerprinting, and human-like interaction pacing.
+"""
+
+import logging
+from datetime import UTC, datetime, timedelta
+from typing import cast
+
+from playwright.async_api import BrowserContext, Page, Playwright, async_playwright
+
+from receiptwitness.config import settings
+from receiptwitness.scrapers.base import BaseScraper, RawReceipt, SessionData
+
+logger = logging.getLogger(__name__)
+
+# Kroger endpoints
+KROGER_BASE = "https://www.kroger.com"
+KROGER_LOGIN_PAGE = f"{KROGER_BASE}/signin"
+KROGER_PURCHASE_HISTORY = f"{KROGER_BASE}/mypurchases"
+KROGER_RECEIPT_API = f"{KROGER_BASE}/atlas/v1/purchase-history/api"
+KROGER_RECEIPT_DETAIL_API = f"{KROGER_BASE}/atlas/v1/receipt/api"
+KROGER_ACCOUNT_PAGE = f"{KROGER_BASE}/account/dashboard"
+
+# Realistic browser fingerprint — Chrome on Windows (matches Kroger's typical audience)
+DEFAULT_USER_AGENT = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/131.0.0.0 Safari/537.36"
+)
+DEFAULT_VIEWPORT = {"width": 1920, "height": 1080}
+DEFAULT_LOCALE = "en-US"
+DEFAULT_TIMEZONE = "America/New_York"
+
+
+class KrogerScraper(BaseScraper):
+    """Scraper for Kroger loyalty purchase history.
+
+    Kroger uses Akamai Bot Manager which aggressively detects headless
+    browsers. This scraper employs enhanced stealth measures:
+    - Masks webdriver/automation signals
+    - Sets realistic browser fingerprint
+    - Uses human-like interaction pacing
+    - Preserves browser context across sessions
+    """
+
+    async def _create_stealth_context(
+        self, playwright_instance: Playwright, cookies: list[dict] | None = None
+    ) -> BrowserContext:
+        """Create a browser context with enhanced stealth for Akamai evasion."""
+        browser = await playwright_instance.chromium.launch(
+            headless=settings.headless,
+            args=[
+                "--disable-blink-features=AutomationControlled",
+                "--no-sandbox",
+                "--disable-dev-shm-usage",
+                "--disable-infobars",
+                "--window-size=1920,1080",
+            ],
+        )
+        context = await browser.new_context(
+            user_agent=DEFAULT_USER_AGENT,
+            viewport=DEFAULT_VIEWPORT,  # type: ignore[arg-type]
+            locale=DEFAULT_LOCALE,
+            timezone_id=DEFAULT_TIMEZONE,
+            java_script_enabled=True,
+            bypass_csp=False,
+            color_scheme="light",
+            has_touch=False,
+        )
+
+        # Enhanced stealth script targeting Akamai Bot Manager detection vectors
+        await context.add_init_script(
+            """
+            // Mask webdriver flag
+            Object.defineProperty(navigator, 'webdriver', {
+                get: () => undefined
+            });
+
+            // Chrome runtime object
+            window.chrome = {
+                runtime: {},
+                loadTimes: function() {},
+                csi: function() {},
+                app: { isInstalled: false }
+            };
+
+            // Realistic plugin array
+            Object.defineProperty(navigator, 'plugins', {
+                get: () => [1, 2, 3, 4, 5]
+            });
+
+            // Languages
+            Object.defineProperty(navigator, 'languages', {
+                get: () => ['en-US', 'en']
+            });
+
+            // Platform
+            Object.defineProperty(navigator, 'platform', {
+                get: () => 'Win32'
+            });
+
+            // Hardware concurrency
+            Object.defineProperty(navigator, 'hardwareConcurrency', {
+                get: () => 8
+            });
+
+            // Device memory
+            Object.defineProperty(navigator, 'deviceMemory', {
+                get: () => 8
+            });
+
+            // Permissions query override (Akamai checks this)
+            const originalQuery = window.navigator.permissions.query;
+            window.navigator.permissions.query = (parameters) =>
+                parameters.name === 'notifications'
+                    ? Promise.resolve({ state: Notification.permission })
+                    : originalQuery(parameters);
+
+            // WebGL vendor/renderer (avoid "Google Inc." / "ANGLE" tells)
+            const getParameter = WebGLRenderingContext.prototype.getParameter;
+            WebGLRenderingContext.prototype.getParameter = function(parameter) {
+                if (parameter === 37445) return 'Intel Inc.';
+                if (parameter === 37446) return 'Intel Iris OpenGL Engine';
+                return getParameter.call(this, parameter);
+            };
+            """
+        )
+
+        if cookies:
+            await context.add_cookies(cookies)  # type: ignore[arg-type]
+
+        return cast(BrowserContext, context)
+
+    async def login(self, username: str, password: str) -> SessionData:
+        """Log in to Kroger and capture session cookies."""
+        async with async_playwright() as p:
+            context = await self._create_stealth_context(p)
+            page = await context.new_page()
+            try:
+                return await self._perform_login(page, context, username, password)
+            finally:
+                if context.browser:
+                    await context.browser.close()
+
+    async def _perform_login(
+        self, page: Page, context: BrowserContext, username: str, password: str
+    ) -> SessionData:
+        """Execute the Kroger login flow."""
+        logger.info("Navigating to Kroger sign-in page")
+        await page.goto(KROGER_LOGIN_PAGE, wait_until="networkidle")
+        await self.human_delay(2000, 4000)
+
+        # Kroger login form — email/username field
+        email_input = page.locator(
+            'input[id="SignIn-emailInput"], '
+            'input[name="email"], '
+            'input[type="email"], '
+            'input[data-testid="SignIn-emailInput"]'
+        )
+        await email_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
+        await email_input.click()
+        await self.human_delay(300, 700)
+        await email_input.fill(username)
+        await self.human_delay(800, 1500)
+
+        # Password field
+        password_input = page.locator(
+            'input[id="SignIn-passwordInput"], '
+            'input[name="password"], '
+            'input[type="password"], '
+            'input[data-testid="SignIn-passwordInput"]'
+        )
+        await password_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
+        await password_input.click()
+        await self.human_delay(300, 700)
+        await password_input.fill(password)
+        await self.human_delay(1000, 2000)
+
+        # Sign-in button
+        sign_in_btn = page.locator(
+            'button[id="SignIn-submitButton"], '
+            'button[data-testid="SignIn-submitButton"], '
+            'button[type="submit"]:has-text("Sign In")'
+        )
+        await sign_in_btn.click()
+
+        # Wait for redirect away from sign-in page
+        await page.wait_for_url(
+            lambda url: "signin" not in url.lower(),
+            timeout=settings.browser_timeout_ms,
+        )
+        await self.human_delay(1500, 3000)
+
+        # Capture cookies
+        raw_cookies = await context.cookies()
+        cookies = [dict(c) for c in raw_cookies]
+        now = datetime.now(UTC)
+
+        logger.info("Kroger login successful, captured %d cookies", len(cookies))
+        return SessionData(
+            cookies=cookies,
+            user_agent=DEFAULT_USER_AGENT,
+            created_at=now,
+            expires_at=now + timedelta(hours=2),
+            extra={"retailer": "kroger"},
+        )
+
+    async def check_session(self, session: SessionData) -> bool:
+        """Check if the Kroger session is still valid."""
+        if session.expires_at and datetime.now(UTC) > session.expires_at:
+            logger.info("Kroger session expired based on timestamp")
+            return False
+
+        async with async_playwright() as p:
+            context = await self._create_stealth_context(p, cookies=session.cookies)
+            page = await context.new_page()
+            try:
+                response = await page.goto(KROGER_ACCOUNT_PAGE, wait_until="networkidle")
+                current_url = page.url.lower()
+                is_valid = "signin" not in current_url and response is not None and response.ok
+                logger.info("Kroger session check: valid=%s (url=%s)", is_valid, page.url)
+                return is_valid
+            except Exception:
+                logger.exception("Kroger session check failed")
+                return False
+            finally:
+                if context.browser:
+                    await context.browser.close()
+
+    async def scrape_receipts(
+        self, session: SessionData, since: datetime | None = None
+    ) -> list[RawReceipt]:
+        """Scrape purchase history from Kroger."""
+        async with async_playwright() as p:
+            context = await self._create_stealth_context(p, cookies=session.cookies)
+            page = await context.new_page()
+            try:
+                return await self._fetch_receipts(page, since)
+            finally:
+                if context.browser:
+                    await context.browser.close()
+
+    async def _fetch_receipts(self, page: Page, since: datetime | None) -> list[RawReceipt]:
+        """Fetch receipt list and details from Kroger purchase history."""
+        # Navigate to purchase history to establish context
+        await page.goto(KROGER_PURCHASE_HISTORY, wait_until="networkidle")
+        await self.human_delay(1500, 3000)
+
+        receipts: list[RawReceipt] = []
+
+        # Kroger purchase history API endpoint
+        api_response = await page.request.get(KROGER_RECEIPT_API)
+        if not api_response.ok:
+            logger.warning(
+                "Kroger purchase history request failed: %d %s",
+                api_response.status,
+                api_response.status_text,
+            )
+            return []
+
+        response = await api_response.json()
+        if not isinstance(response, dict):
+            logger.warning("Unexpected purchase history response type: %s", type(response))
+            return []
+
+        # Handle Kroger's response structure
+        orders = response.get("orders", response.get("purchases", []))
+        if not isinstance(orders, list):
+            logger.warning("No orders found in Kroger purchase history response")
+            return []
+
+        logger.info("Found %d orders in Kroger purchase history", len(orders))
+
+        for order in orders:
+            raw_id = order.get("orderId") or order.get("receiptId") or order.get("id") or ""
+            order_id = str(raw_id)
+            purchase_date = order.get(
+                "purchaseDate", order.get("transactionDate", order.get("date", ""))
+            )
+
+            # Filter by date if 'since' is provided
+            if since and purchase_date:
+                try:
+                    txn_dt = datetime.fromisoformat(purchase_date.replace("Z", "+00:00"))
+                    if txn_dt < since:
+                        continue
+                except (ValueError, TypeError):
+                    pass
+
+            if not order_id:
+                continue
+
+            await self.human_delay(1000, 2500)
+
+            # Fetch receipt detail
+            detail = await self._fetch_receipt_detail(page, order_id)
+
+            raw_store = (
+                order.get("storeNumber")
+                or order.get("divisionNumber")
+                or order.get("storeId")
+                or ""
+            )
+            store_number = str(raw_store)
+
+            receipts.append(
+                RawReceipt(
+                    receipt_id=order_id,
+                    purchase_date=purchase_date,
+                    store_number=store_number,
+                    raw_data={**order, "detail": detail},
+                    source_url=f"{KROGER_RECEIPT_DETAIL_API}?orderId={order_id}",
+                )
+            )
+
+        logger.info("Scraped %d receipts from Kroger", len(receipts))
+        return receipts
+
+    async def _fetch_receipt_detail(self, page: Page, order_id: str) -> dict:
+        """Fetch detailed receipt data for a single Kroger order."""
+        try:
+            url = f"{KROGER_RECEIPT_DETAIL_API}?orderId={order_id}"
+            api_response = await page.request.get(url)
+            if not api_response.ok:
+                logger.warning(
+                    "Kroger receipt detail request failed for %s: %d",
+                    order_id,
+                    api_response.status,
+                )
+                return {}
+            detail = await api_response.json()
+            return detail if isinstance(detail, dict) else {}
+        except Exception:
+            logger.exception("Failed to fetch Kroger receipt detail for %s", order_id)
+            return {}
+
+    def parse_receipt(self, raw: RawReceipt) -> dict:
+        """Parse raw Kroger receipt into structured purchase data."""
+        from receiptwitness.parsers.kroger import parse_kroger_receipt
+
+        return parse_kroger_receipt(raw)
diff --git a/src/receiptwitness/scrapers/meijer.py b/src/receiptwitness/scrapers/meijer.py
new file mode 100644
index 0000000..4a4dd8e
--- /dev/null
+++ b/src/receiptwitness/scrapers/meijer.py
@@ -0,0 +1,301 @@
+"""Meijer mPerks scraper using Playwright.
+
+Meijer has no public API. We reverse-engineer the XHR endpoints the mPerks
+web app uses to pull purchase history and receipt data. The flow:
+
+1. Launch stealth Playwright browser
+2. Navigate to mPerks login page and authenticate
+3. Capture session cookies after successful login
+4. Use those cookies to hit the mPerks receipt API endpoints directly
+5. Parse receipt JSON into structured PurchaseCreate records
+
+Key endpoints (reverse-engineered from mPerks SPA):
+- Login:       POST https://www.meijer.com/bin/meijer/account/login
+- Receipts:    GET  https://www.meijer.com/bin/meijer/profile/purchasehistory
+- Receipt detail: GET https://www.meijer.com/bin/meijer/profile/receipt?receiptId=...
+"""
+
+import logging
+from datetime import UTC, datetime, timedelta
+from typing import cast
+
+from playwright.async_api import BrowserContext, Page, Playwright, async_playwright
+
+from receiptwitness.config import settings
+from receiptwitness.scrapers.base import BaseScraper, RawReceipt, SessionData
+
+logger = logging.getLogger(__name__)
+
+# Meijer mPerks URLs
+MEIJER_BASE = "https://www.meijer.com"
+MEIJER_LOGIN_PAGE = f"{MEIJER_BASE}/shopping/login.html"
+MEIJER_LOGIN_API = f"{MEIJER_BASE}/bin/meijer/account/login"
+MEIJER_PURCHASE_HISTORY = f"{MEIJER_BASE}/bin/meijer/profile/purchasehistory"
+MEIJER_RECEIPT_DETAIL = f"{MEIJER_BASE}/bin/meijer/profile/receipt"
+MEIJER_MPERKS_HOME = f"{MEIJER_BASE}/mperks.html"
+
+# Realistic browser fingerprint
+DEFAULT_USER_AGENT = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/131.0.0.0 Safari/537.36"
+)
+DEFAULT_VIEWPORT = {"width": 1920, "height": 1080}
+DEFAULT_LOCALE = "en-US"
+DEFAULT_TIMEZONE = "America/Detroit"  # Meijer HQ is in Grand Rapids, MI
+
+
+class MeijerScraper(BaseScraper):
+    """Scraper for Meijer mPerks purchase history."""
+
+    async def _create_stealth_context(
+        self, playwright_instance: Playwright, cookies: list[dict] | None = None
+    ) -> BrowserContext:
+        """Create a browser context with stealth settings."""
+        browser = await playwright_instance.chromium.launch(
+            headless=settings.headless,
+            args=[
+                "--disable-blink-features=AutomationControlled",
+                "--no-sandbox",
+            ],
+        )
+        context = await browser.new_context(
+            user_agent=DEFAULT_USER_AGENT,
+            viewport=DEFAULT_VIEWPORT,  # type: ignore[arg-type]
+            locale=DEFAULT_LOCALE,
+            timezone_id=DEFAULT_TIMEZONE,
+            java_script_enabled=True,
+            bypass_csp=False,
+        )
+        # Mask webdriver flag
+        await context.add_init_script(
+            """
+            Object.defineProperty(navigator, 'webdriver', {
+                get: () => undefined
+            });
+            // Mask chrome automation indicators
+            window.chrome = { runtime: {} };
+            Object.defineProperty(navigator, 'plugins', {
+                get: () => [1, 2, 3, 4, 5]
+            });
+            Object.defineProperty(navigator, 'languages', {
+                get: () => ['en-US', 'en']
+            });
+            """
+        )
+        if cookies:
+            await context.add_cookies(cookies)  # type: ignore[arg-type]
+        return cast(BrowserContext, context)
+
+    async def login(self, username: str, password: str) -> SessionData:
+        """Log in to Meijer mPerks and capture session cookies.
+
+        The mPerks login flow:
+        1. Navigate to login page
+        2. Fill email and password fields
+        3. Click sign-in button
+        4. Wait for redirect to mPerks dashboard
+        5. Extract session cookies
+        """
+        async with async_playwright() as p:
+            context = await self._create_stealth_context(p)
+            page = await context.new_page()
+            try:
+                return await self._perform_login(page, context, username, password)
+            finally:
+                if context.browser:
+                    await context.browser.close()
+
+    async def _perform_login(
+        self, page: Page, context: BrowserContext, username: str, password: str
+    ) -> SessionData:
+        """Execute the login flow on the mPerks portal."""
+        logger.info("Navigating to Meijer login page")
+        await page.goto(MEIJER_LOGIN_PAGE, wait_until="networkidle")
+        await self.human_delay(1500, 3000)
+
+        # Fill email field
+        email_input = page.locator('input[type="email"], input[name="email"], #email')
+        await email_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
+        await email_input.click()
+        await self.human_delay(200, 500)
+        await email_input.fill(username)
+        await self.human_delay(500, 1000)
+
+        # Fill password field
+        password_input = page.locator('input[type="password"], input[name="password"], #password')
+        await password_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
+        await password_input.click()
+        await self.human_delay(200, 500)
+        await password_input.fill(password)
+        await self.human_delay(500, 1500)
+
+        # Click sign-in button
+        sign_in_btn = page.locator(
+            'button[type="submit"], button:has-text("Sign In"), button:has-text("Log In")'
+        )
+        await sign_in_btn.click()
+
+        # Wait for navigation after login
+        await page.wait_for_url(
+            lambda url: "login" not in url.lower(),
+            timeout=settings.browser_timeout_ms,
+        )
+        await self.human_delay(1000, 2000)
+
+        # Capture cookies
+        raw_cookies = await context.cookies()
+        cookies = [dict(c) for c in raw_cookies]
+        now = datetime.now(UTC)
+
+        logger.info("Meijer login successful, captured %d cookies", len(cookies))
+        return SessionData(
+            cookies=cookies,
+            user_agent=DEFAULT_USER_AGENT,
+            created_at=now,
+            expires_at=now + timedelta(hours=4),
+        )
+
+    async def check_session(self, session: SessionData) -> bool:
+        """Check if the mPerks session is still valid.
+
+        Makes a lightweight request to the mPerks home page and checks
+        if we get redirected to login (session expired) or not.
+        """
+        if session.expires_at and datetime.now(UTC) > session.expires_at:
+            logger.info("Meijer session expired based on timestamp")
+            return False
+
+        async with async_playwright() as p:
+            context = await self._create_stealth_context(p, cookies=session.cookies)
+            page = await context.new_page()
+            try:
+                response = await page.goto(MEIJER_MPERKS_HOME, wait_until="networkidle")
+                current_url = page.url.lower()
+                is_valid = "login" not in current_url and response is not None and response.ok
+                logger.info("Meijer session check: valid=%s (url=%s)", is_valid, page.url)
+                return is_valid
+            except Exception:
+                logger.exception("Meijer session check failed")
+                return False
+            finally:
+                if context.browser:
+                    await context.browser.close()
+
+    async def scrape_receipts(
+        self, session: SessionData, since: datetime | None = None
+    ) -> list[RawReceipt]:
+        """Scrape purchase history from Meijer mPerks.
+
+        Uses the XHR endpoints the mPerks SPA calls to fetch receipt data.
+        The purchase history endpoint returns a list of recent transactions,
+        and we can fetch individual receipt details for line items.
+        """
+        async with async_playwright() as p:
+            context = await self._create_stealth_context(p, cookies=session.cookies)
+            page = await context.new_page()
+            try:
+                return await self._fetch_receipts(page, since)
+            finally:
+                if context.browser:
+                    await context.browser.close()
+
+    async def _fetch_receipts(self, page: Page, since: datetime | None) -> list[RawReceipt]:
+        """Fetch receipt list and detail via mPerks XHR endpoints.
+
+        Uses Playwright's page.request API (APIRequestContext) instead of
+        page.evaluate(fetch(...)) for better observability — requests show up
+        in Playwright traces and can be intercepted by route handlers.
+        """
+        # Navigate to mPerks to establish context (cookies need domain context)
+        await page.goto(MEIJER_MPERKS_HOME, wait_until="networkidle")
+        await self.human_delay(1000, 2000)
+
+        receipts: list[RawReceipt] = []
+
+        # Fetch purchase history listing via page.request (APIRequestContext)
+        api_response = await page.request.get(MEIJER_PURCHASE_HISTORY)
+        if not api_response.ok:
+            logger.warning(
+                "Purchase history request failed: %d %s",
+                api_response.status,
+                api_response.status_text,
+            )
+            return []
+
+        response = await api_response.json()
+
+        if not isinstance(response, dict):
+            logger.warning("Unexpected purchase history response type: %s", type(response))
+            return []
+
+        transactions = response.get("transactions", response.get("purchaseHistory", []))
+        if not isinstance(transactions, list):
+            logger.warning("No transactions found in purchase history response")
+            return []
+
+        logger.info("Found %d transactions in Meijer purchase history", len(transactions))
+
+        for txn in transactions:
+            receipt_id = str(txn.get("transactionId", txn.get("receiptId", "")))
+            purchase_date = txn.get("transactionDate", txn.get("purchaseDate", ""))
+
+            # Filter by date if 'since' is provided
+            if since and purchase_date:
+                try:
+                    txn_dt = datetime.fromisoformat(purchase_date.replace("Z", "+00:00"))
+                    if txn_dt < since:
+                        continue
+                except (ValueError, TypeError):
+                    pass
+
+            if not receipt_id:
+                continue
+
+            await self.human_delay(800, 2000)
+
+            # Fetch receipt detail
+            detail = await self._fetch_receipt_detail(page, receipt_id)
+
+            receipts.append(
+                RawReceipt(
+                    receipt_id=receipt_id,
+                    purchase_date=purchase_date,
+                    store_number=str(txn.get("storeNumber", txn.get("storeId", ""))),
+                    raw_data={**txn, "detail": detail},
+                    source_url=f"{MEIJER_RECEIPT_DETAIL}?receiptId={receipt_id}",
+                )
+            )
+
+        logger.info("Scraped %d receipts from Meijer", len(receipts))
+        return receipts
+
+    async def _fetch_receipt_detail(self, page: Page, receipt_id: str) -> dict:
+        """Fetch detailed receipt data for a single transaction.
+
+        Uses Playwright's page.request API for traceability.
+        """
+        try:
+            url = f"{MEIJER_RECEIPT_DETAIL}?receiptId={receipt_id}"
+            api_response = await page.request.get(url)
+            if not api_response.ok:
+                logger.warning(
+                    "Receipt detail request failed for %s: %d",
+                    receipt_id,
+                    api_response.status,
+                )
+                return {}
+            detail = await api_response.json()
+            return detail if isinstance(detail, dict) else {}
+        except Exception:
+            logger.exception("Failed to fetch receipt detail for %s", receipt_id)
+            return {}
+
+    def parse_receipt(self, raw: RawReceipt) -> dict:
+        """Parse raw Meijer receipt into structured purchase data.
+
+        Delegates to the dedicated parser module.
+        """
+        from receiptwitness.parsers.meijer import parse_meijer_receipt
+
+        return parse_meijer_receipt(raw)
diff --git a/src/receiptwitness/scrapers/target.py b/src/receiptwitness/scrapers/target.py
new file mode 100644
index 0000000..1f959a6
--- /dev/null
+++ b/src/receiptwitness/scrapers/target.py
@@ -0,0 +1,326 @@
+"""Target Circle scraper using Playwright.
+
+Target stores ~1 year of in-store purchase history tied to Circle accounts.
+Purchases appear when the user pays with a linked card, uses the Target app
+wallet, or enters their Circle phone number at checkout.
+
+Key endpoints (reverse-engineered from target.com SPA):
+- Login:          POST https://gsp.target.com/gsp/authentications/v1/auth_codes
+- Order history:  GET  https://api.target.com/order_history/v1/orders (in-store tab)
+- Receipt detail: GET  https://api.target.com/order_history/v1/orders/{orderId}
+"""
+
+import logging
+from datetime import UTC, datetime, timedelta
+from typing import cast
+
+from playwright.async_api import BrowserContext, Page, Playwright, async_playwright
+
+from receiptwitness.config import settings
+from receiptwitness.scrapers.base import BaseScraper, RawReceipt, SessionData
+
+logger = logging.getLogger(__name__)
+
+# Target endpoints
+TARGET_BASE = "https://www.target.com"
+TARGET_LOGIN_PAGE = f"{TARGET_BASE}/login"
+TARGET_ACCOUNT_PAGE = f"{TARGET_BASE}/account"
+TARGET_ORDER_HISTORY = f"{TARGET_BASE}/account/orders"
+TARGET_ORDER_API = "https://api.target.com/order_history/v1/orders"
+TARGET_RECEIPT_API = "https://api.target.com/order_history/v1/orders"
+
+# Realistic browser fingerprint — Chrome on Windows
+DEFAULT_USER_AGENT = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/131.0.0.0 Safari/537.36"
+)
+DEFAULT_VIEWPORT = {"width": 1920, "height": 1080}
+DEFAULT_LOCALE = "en-US"
+DEFAULT_TIMEZONE = "America/Detroit"  # SE Michigan coverage
+
+
+class TargetScraper(BaseScraper):
+    """Scraper for Target Circle in-store purchase history.
+
+    Target's order history SPA loads purchase data from internal API
+    endpoints. This scraper authenticates via the web login flow,
+    captures session cookies, and uses those to hit the order history
+    API for in-store receipt data.
+    """
+
+    async def _create_stealth_context(
+        self, playwright_instance: Playwright, cookies: list[dict] | None = None
+    ) -> BrowserContext:
+        """Create a browser context with stealth settings for Target."""
+        browser = await playwright_instance.chromium.launch(
+            headless=settings.headless,
+            args=[
+                "--disable-blink-features=AutomationControlled",
+                "--no-sandbox",
+                "--disable-dev-shm-usage",
+            ],
+        )
+        context = await browser.new_context(
+            user_agent=DEFAULT_USER_AGENT,
+            viewport=DEFAULT_VIEWPORT,  # type: ignore[arg-type]
+            locale=DEFAULT_LOCALE,
+            timezone_id=DEFAULT_TIMEZONE,
+            java_script_enabled=True,
+            bypass_csp=False,
+            color_scheme="light",
+            has_touch=False,
+        )
+        # Mask webdriver and automation signals
+        await context.add_init_script(
+            """
+            Object.defineProperty(navigator, 'webdriver', {
+                get: () => undefined
+            });
+
+            window.chrome = {
+                runtime: {},
+                loadTimes: function() {},
+                csi: function() {},
+                app: { isInstalled: false }
+            };
+
+            Object.defineProperty(navigator, 'plugins', {
+                get: () => [1, 2, 3, 4, 5]
+            });
+
+            Object.defineProperty(navigator, 'languages', {
+                get: () => ['en-US', 'en']
+            });
+
+            Object.defineProperty(navigator, 'platform', {
+                get: () => 'Win32'
+            });
+
+            Object.defineProperty(navigator, 'hardwareConcurrency', {
+                get: () => 8
+            });
+
+            Object.defineProperty(navigator, 'deviceMemory', {
+                get: () => 8
+            });
+            """
+        )
+        if cookies:
+            await context.add_cookies(cookies)  # type: ignore[arg-type]
+        return cast(BrowserContext, context)
+
+    async def login(self, username: str, password: str) -> SessionData:
+        """Log in to Target and capture session cookies."""
+        async with async_playwright() as p:
+            context = await self._create_stealth_context(p)
+            page = await context.new_page()
+            try:
+                return await self._perform_login(page, context, username, password)
+            finally:
+                if context.browser:
+                    await context.browser.close()
+
+    async def _perform_login(
+        self, page: Page, context: BrowserContext, username: str, password: str
+    ) -> SessionData:
+        """Execute the Target login flow."""
+        logger.info("Navigating to Target sign-in page")
+        await page.goto(TARGET_LOGIN_PAGE, wait_until="networkidle")
+        await self.human_delay(2000, 4000)
+
+        # Target login form — email/username field
+        email_input = page.locator(
+            'input[id="username"], '
+            'input[name="username"], '
+            'input[type="email"], '
+            'input[data-test="username"]'
+        )
+        await email_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
+        await email_input.click()
+        await self.human_delay(300, 700)
+        await email_input.fill(username)
+        await self.human_delay(800, 1500)
+
+        # Password field
+        password_input = page.locator(
+            'input[id="password"], '
+            'input[name="password"], '
+            'input[type="password"], '
+            'input[data-test="password"]'
+        )
+        await password_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
+        await password_input.click()
+        await self.human_delay(300, 700)
+        await password_input.fill(password)
+        await self.human_delay(1000, 2000)
+
+        # Sign-in button
+        sign_in_btn = page.locator(
+            'button[id="login"], '
+            'button[data-test="login-button"], '
+            'button[type="submit"]:has-text("Sign in")'
+        )
+        await sign_in_btn.click()
+
+        # Wait for redirect away from login page
+        await page.wait_for_url(
+            lambda url: "login" not in url.lower(),
+            timeout=settings.browser_timeout_ms,
+        )
+        await self.human_delay(1500, 3000)
+
+        # Capture cookies
+        raw_cookies = await context.cookies()
+        cookies = [dict(c) for c in raw_cookies]
+        now = datetime.now(UTC)
+
+        logger.info("Target login successful, captured %d cookies", len(cookies))
+        return SessionData(
+            cookies=cookies,
+            user_agent=DEFAULT_USER_AGENT,
+            created_at=now,
+            expires_at=now + timedelta(hours=2),
+            extra={"retailer": "target"},
+        )
+
+    async def check_session(self, session: SessionData) -> bool:
+        """Check if the Target session is still valid."""
+        if session.expires_at and datetime.now(UTC) > session.expires_at:
+            logger.info("Target session expired based on timestamp")
+            return False
+
+        async with async_playwright() as p:
+            context = await self._create_stealth_context(p, cookies=session.cookies)
+            page = await context.new_page()
+            try:
+                response = await page.goto(TARGET_ACCOUNT_PAGE, wait_until="networkidle")
+                current_url = page.url.lower()
+                is_valid = "login" not in current_url and response is not None and response.ok
+                logger.info("Target session check: valid=%s (url=%s)", is_valid, page.url)
+                return is_valid
+            except Exception:
+                logger.exception("Target session check failed")
+                return False
+            finally:
+                if context.browser:
+                    await context.browser.close()
+
+    async def scrape_receipts(
+        self, session: SessionData, since: datetime | None = None
+    ) -> list[RawReceipt]:
+        """Scrape in-store purchase history from Target Circle."""
+        async with async_playwright() as p:
+            context = await self._create_stealth_context(p, cookies=session.cookies)
+            page = await context.new_page()
+            try:
+                return await self._fetch_receipts(page, since)
+            finally:
+                if context.browser:
+                    await context.browser.close()
+
+    async def _fetch_receipts(self, page: Page, since: datetime | None) -> list[RawReceipt]:
+        """Fetch receipt list and details from Target order history.
+
+        Target's order history page has separate tabs for online and in-store
+        purchases. We target the in-store tab which shows Circle-linked
+        transactions.
+        """
+        # Navigate to order history to establish context
+        await page.goto(TARGET_ORDER_HISTORY, wait_until="networkidle")
+        await self.human_delay(1500, 3000)
+
+        receipts: list[RawReceipt] = []
+
+        # Target order history API — filter for in-store purchases
+        api_response = await page.request.get(
+            TARGET_ORDER_API,
+            params={"channel": "in_store", "limit": "50"},
+        )
+        if not api_response.ok:
+            logger.warning(
+                "Target order history request failed: %d %s",
+                api_response.status,
+                api_response.status_text,
+            )
+            return []
+
+        response = await api_response.json()
+        if not isinstance(response, dict):
+            logger.warning("Unexpected order history response type: %s", type(response))
+            return []
+
+        # Target uses "orders" key for in-store purchase list
+        orders = response.get("orders", response.get("transactions", []))
+        if not isinstance(orders, list):
+            logger.warning("No orders found in Target order history response")
+            return []
+
+        logger.info("Found %d in-store orders in Target history", len(orders))
+
+        for order in orders:
+            raw_id = order.get("orderId") or order.get("transactionId") or order.get("id") or ""
+            order_id = str(raw_id)
+            purchase_date = order.get(
+                "purchaseDate",
+                order.get("transactionDate", order.get("date", "")),
+            )
+
+            # Filter by date if 'since' is provided
+            if since and purchase_date:
+                try:
+                    txn_dt = datetime.fromisoformat(purchase_date.replace("Z", "+00:00"))
+                    if txn_dt < since:
+                        continue
+                except (ValueError, TypeError):
+                    pass
+
+            if not order_id:
+                continue
+
+            await self.human_delay(1000, 2500)
+
+            # Fetch receipt detail
+            detail = await self._fetch_receipt_detail(page, order_id)
+
+            raw_store = (
+                order.get("storeNumber") or order.get("storeId") or order.get("locationId") or ""
+            )
+            store_number = str(raw_store)
+
+            receipts.append(
+                RawReceipt(
+                    receipt_id=order_id,
+                    purchase_date=purchase_date,
+                    store_number=store_number,
+                    raw_data={**order, "detail": detail},
+                    source_url=f"{TARGET_RECEIPT_API}/{order_id}",
+                )
+            )
+
+        logger.info("Scraped %d receipts from Target", len(receipts))
+        return receipts
+
+    async def _fetch_receipt_detail(self, page: Page, order_id: str) -> dict:
+        """Fetch detailed receipt data for a single Target order."""
+        try:
+            url = f"{TARGET_RECEIPT_API}/{order_id}"
+            api_response = await page.request.get(url)
+            if not api_response.ok:
+                logger.warning(
+                    "Target receipt detail request failed for %s: %d",
+                    order_id,
+                    api_response.status,
+                )
+                return {}
+            detail = await api_response.json()
+            return detail if isinstance(detail, dict) else {}
+        except Exception:
+            logger.exception("Failed to fetch Target receipt detail for %s", order_id)
+            return {}
+
+    def parse_receipt(self, raw: RawReceipt) -> dict:
+        """Parse raw Target receipt into structured purchase data."""
+        from receiptwitness.parsers.target import parse_target_receipt
+
+        return parse_target_receipt(raw)
diff --git a/src/receiptwitness/session/__init__.py b/src/receiptwitness/session/__init__.py
new file mode 100644
index 0000000..70beaef
--- /dev/null
+++ b/src/receiptwitness/session/__init__.py
@@ -0,0 +1 @@
+"""Session management — encrypted cookie storage and refresh logic."""
diff --git a/src/receiptwitness/session/encryption.py b/src/receiptwitness/session/encryption.py
new file mode 100644
index 0000000..b406bcf
--- /dev/null
+++ b/src/receiptwitness/session/encryption.py
@@ -0,0 +1,52 @@
+"""Fernet-based encryption for session cookies at rest.
+
+Session data (cookies, tokens) is encrypted before writing to the database
+and decrypted only when needed for a scrape. The encryption key is provided
+via the RW_SESSION_ENCRYPTION_KEY environment variable — it is never stored
+in the database or logged.
+"""
+
+import json
+import logging
+
+from cryptography.fernet import Fernet, InvalidToken
+
+from receiptwitness.config import settings
+
+logger = logging.getLogger(__name__)
+
+
+def _get_fernet() -> Fernet:
+    """Get a Fernet instance using the configured encryption key."""
+    key = settings.session_encryption_key
+    if not key:
+        raise ValueError(
+            "RW_SESSION_ENCRYPTION_KEY is not set. "
+            "Generate one with: "
+            "python -c 'from cryptography.fernet import Fernet; "
+            "print(Fernet.generate_key().decode())'"
+        )
+    return Fernet(key.encode() if isinstance(key, str) else key)
+
+
+def encrypt_session_data(data: dict) -> str:
+    """Encrypt session data dict to a Fernet token string.
+
+    The data is JSON-serialized, then encrypted. The result is a
+    URL-safe base64-encoded string suitable for storing in JSONB.
+    """
+    f = _get_fernet()
+    plaintext = json.dumps(data, default=str).encode("utf-8")
+    return f.encrypt(plaintext).decode("utf-8")
+
+
+def decrypt_session_data(encrypted: str) -> dict:
+    """Decrypt a Fernet token string back to a session data dict."""
+    f = _get_fernet()
+    try:
+        plaintext = f.decrypt(encrypted.encode("utf-8"))
+        result: dict = json.loads(plaintext)
+        return result
+    except InvalidToken:
+        logger.error("Failed to decrypt session data — invalid token or wrong key")
+        raise
diff --git a/src/receiptwitness/session/manager.py b/src/receiptwitness/session/manager.py
new file mode 100644
index 0000000..205ccbd
--- /dev/null
+++ b/src/receiptwitness/session/manager.py
@@ -0,0 +1,81 @@
+"""Session storage, retrieval, and refresh logic.
+
+Manages the lifecycle of retailer session data:
+- Load encrypted session from DB
+- Check validity via scraper
+- Re-authenticate if expired
+- Save new session back (encrypted)
+"""
+
+import logging
+from dataclasses import asdict
+from datetime import UTC, datetime
+
+from receiptwitness.scrapers.base import BaseScraper, SessionData
+from receiptwitness.session.encryption import decrypt_session_data, encrypt_session_data
+
+logger = logging.getLogger(__name__)
+
+
+def session_from_db_record(session_data_encrypted: str | None) -> SessionData | None:
+    """Deserialize and decrypt a session from the database.
+
+    The session_data column in user_store_accounts stores the Fernet-encrypted
+    JSON of the SessionData fields.
+    """
+    if not session_data_encrypted:
+        return None
+
+    try:
+        data = decrypt_session_data(session_data_encrypted)
+        return SessionData(
+            cookies=data["cookies"],
+            user_agent=data["user_agent"],
+            created_at=datetime.fromisoformat(data["created_at"]),
+            expires_at=(
+                datetime.fromisoformat(data["expires_at"]) if data.get("expires_at") else None
+            ),
+            extra=data.get("extra", {}),
+        )
+    except Exception:
+        logger.exception("Failed to load session from DB record")
+        return None
+
+
+def session_to_db_value(session: SessionData) -> str:
+    """Serialize and encrypt a session for database storage."""
+    data = asdict(session)
+    # Convert datetime objects to ISO strings for JSON serialization
+    data["created_at"] = session.created_at.isoformat()
+    if session.expires_at:
+        data["expires_at"] = session.expires_at.isoformat()
+    return encrypt_session_data(data)
+
+
+async def get_valid_session(
+    scraper: BaseScraper,
+    session_data_encrypted: str | None,
+    username: str,
+    password: str,
+) -> tuple[SessionData, bool]:
+    """Get a valid session, re-authenticating if needed.
+
+    Returns:
+        A tuple of (session, was_refreshed). If was_refreshed is True,
+        the caller should persist the new session to the database.
+    """
+    # Try existing session first
+    existing = session_from_db_record(session_data_encrypted)
+    if existing:
+        if existing.expires_at and datetime.now(UTC) > existing.expires_at:
+            logger.info("Session expired by timestamp, re-authenticating")
+        elif await scraper.check_session(existing):
+            logger.info("Existing session is valid")
+            return existing, False
+        else:
+            logger.info("Session check failed, re-authenticating")
+
+    # Need to re-authenticate
+    logger.info("Performing fresh login")
+    new_session = await scraper.login(username, password)
+    return new_session, True