forked from cartsnitch/cartsnitch
Merge commit '342906c9d178923d462a08aec35e486703366eba' as 'receiptwitness'
This commit is contained in:
@@ -0,0 +1,12 @@
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.pytest_cache/
|
||||
*.egg-info/
|
||||
dist/
|
||||
.venv/
|
||||
.env
|
||||
.git/
|
||||
.github/
|
||||
tests/
|
||||
*.md
|
||||
renovate.json
|
||||
+168
@@ -0,0 +1,168 @@
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
concurrency:
|
||||
group: ci-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
packages: write
|
||||
|
||||
env:
|
||||
REGISTRY: ghcr.io
|
||||
IMAGE_NAME: cartsnitch/receiptwitness
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
runs-on: runners-cartsnitch
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.12"
|
||||
cache: pip
|
||||
- name: Install cartsnitch-common from GitHub
|
||||
run: pip install "cartsnitch-common @ git+https://github.com/cartsnitch/common.git@76685ed0384103228cd670b477b967e7752ebe6b"
|
||||
- run: pip install ruff
|
||||
- name: Ruff lint
|
||||
run: ruff check .
|
||||
- name: Ruff format check
|
||||
run: ruff format --check .
|
||||
|
||||
typecheck:
|
||||
runs-on: runners-cartsnitch
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.12"
|
||||
cache: pip
|
||||
- name: Install cartsnitch-common from GitHub
|
||||
run: pip install "cartsnitch-common @ git+https://github.com/cartsnitch/common.git@76685ed0384103228cd670b477b967e7752ebe6b"
|
||||
- run: pip install -e ".[dev]" mypy
|
||||
- name: Type check
|
||||
run: mypy src/receiptwitness
|
||||
|
||||
test:
|
||||
runs-on: runners-cartsnitch
|
||||
services:
|
||||
postgres:
|
||||
image: postgres:15-alpine
|
||||
credentials:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
env:
|
||||
POSTGRES_USER: cartsnitch
|
||||
POSTGRES_PASSWORD: cartsnitch_test
|
||||
POSTGRES_DB: cartsnitch_test
|
||||
ports:
|
||||
- 5432:5432
|
||||
options: >-
|
||||
--health-cmd pg_isready
|
||||
--health-interval 10s
|
||||
--health-timeout 5s
|
||||
--health-retries 5
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
credentials:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
ports:
|
||||
- 6379:6379
|
||||
options: >-
|
||||
--health-cmd "redis-cli ping"
|
||||
--health-interval 10s
|
||||
--health-timeout 5s
|
||||
--health-retries 5
|
||||
env:
|
||||
DATABASE_URL: postgresql://cartsnitch:cartsnitch_test@localhost:5432/cartsnitch_test
|
||||
REDIS_URL: redis://localhost:6379/0
|
||||
ENCRYPTION_KEY: dGVzdC1lbmNyeXB0aW9uLWtleS0xMjM0NTY3ODk=
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.12"
|
||||
cache: pip
|
||||
- name: Install cartsnitch-common from GitHub
|
||||
run: pip install "cartsnitch-common @ git+https://github.com/cartsnitch/common.git@76685ed0384103228cd670b477b967e7752ebe6b"
|
||||
- run: pip install -e ".[dev]"
|
||||
- name: Install Playwright browsers
|
||||
run: playwright install chromium --with-deps
|
||||
- name: Run tests
|
||||
run: pytest --tb=short -q
|
||||
|
||||
build-and-push:
|
||||
runs-on: runners-cartsnitch
|
||||
needs: [lint, test]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Generate CalVer tag
|
||||
id: calver
|
||||
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
||||
run: |
|
||||
DATE_TAG=$(date -u +%Y.%m.%d)
|
||||
EXISTING=$(git tag -l "v${DATE_TAG}*" | sort -V | tail -1)
|
||||
if [ -z "$EXISTING" ]; then
|
||||
VERSION="$DATE_TAG"
|
||||
elif [ "$EXISTING" = "v${DATE_TAG}" ]; then
|
||||
VERSION="${DATE_TAG}.2"
|
||||
else
|
||||
BUILD_NUM=$(echo "$EXISTING" | sed "s/v${DATE_TAG}\.//")
|
||||
VERSION="${DATE_TAG}.$((BUILD_NUM + 1))"
|
||||
fi
|
||||
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
|
||||
echo "CalVer tag: $VERSION"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
- name: Log in to GHCR
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Extract metadata
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
|
||||
tags: |
|
||||
type=sha,prefix=sha-
|
||||
type=raw,value=${{ steps.calver.outputs.version }},enable=${{ github.ref == 'refs/heads/main' }}
|
||||
type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}
|
||||
|
||||
- name: Build and push Docker image
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
push: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
target: prod
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
- name: Create git tag
|
||||
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
||||
run: |
|
||||
git tag "v${{ steps.calver.outputs.version }}"
|
||||
git push origin "v${{ steps.calver.outputs.version }}"
|
||||
@@ -0,0 +1,7 @@
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.pytest_cache/
|
||||
*.egg-info/
|
||||
dist/
|
||||
.venv/
|
||||
.env
|
||||
@@ -0,0 +1,227 @@
|
||||
# ReceiptWitness — CartSnitch Receipt Ingestion Service
|
||||
|
||||
## Project Context
|
||||
|
||||
CartSnitch is a self-hosted grocery price intelligence platform built as a polyrepo microservices architecture. This repo (`cartsnitch/receiptwitness`) is the receipt/purchase history ingestion service.
|
||||
|
||||
**GitHub org:** github.com/cartsnitch
|
||||
**Domain:** cartsnitch.com
|
||||
|
||||
### CartSnitch Services
|
||||
|
||||
| Repo | Service | Purpose |
|
||||
|------|---------|---------|
|
||||
| `cartsnitch/common` | — | Shared models, schemas, utilities |
|
||||
| `cartsnitch/receiptwitness` | ReceiptWitness | Purchase data ingestion via retailer scrapers (this repo) |
|
||||
| `cartsnitch/api` | API Gateway | Frontend-facing REST API |
|
||||
| `cartsnitch/cartsnitch` | Frontend | React PWA (mobile-first) |
|
||||
| `cartsnitch/stickershock` | StickerShock | Price increase detection & CPI comparison |
|
||||
| `cartsnitch/shrinkray` | ShrinkRay | Shrinkflation monitoring |
|
||||
| `cartsnitch/clipartist` | ClipArtist | Coupon/deal watching & shopping optimization |
|
||||
| `cartsnitch/infra` | — | K8s manifests, Flux kustomizations |
|
||||
|
||||
### Architecture Decisions
|
||||
|
||||
- **Polyrepo:** Each service has its own repo, Dockerfile, CI/CD pipeline.
|
||||
- **Shared DB:** One PostgreSQL cluster. This service writes to `purchases`, `purchase_items`, `price_history` tables. Models come from `cartsnitch-common`.
|
||||
- **Inter-service comms:** REST (synchronous) + Redis pub/sub (async events).
|
||||
- **Target scale:** 500–1,000 users. Each user has their own authenticated sessions to up to 3 retailers.
|
||||
|
||||
## What This Service Does
|
||||
|
||||
ReceiptWitness authenticates with grocery retailer web portals using per-user sessions, scrapes purchase history / receipt data, parses it into structured records, and writes it to the shared database. After ingestion, it publishes a `cartsnitch.receipts.ingested` event so downstream services (StickerShock, ClipArtist) can react.
|
||||
|
||||
### Target Retailers (MVP)
|
||||
|
||||
#### Meijer (mPerks)
|
||||
- **Auth:** No public API. Session cookie-based auth on mperks.meijer.com.
|
||||
- **Receipt location:** meijer.com/mperks/receipts-savings.html (or underlying XHR endpoints)
|
||||
- **Approach:** Playwright login → capture session → hit receipt XHR endpoints directly. Map the API calls the frontend makes via browser dev tools network tab.
|
||||
- **Prior art:** `dapperfu/python_Meijer` (requires MITM proxy for auth — avoid this pattern, prefer direct browser automation).
|
||||
- **Data available:** Digital receipts appear ~15 minutes after purchase if mPerks ID was used at checkout. Includes item names, prices, discounts, savings.
|
||||
|
||||
#### Kroger
|
||||
- **Auth:** No public API for purchase history (that's behind Partner API). Session cookie-based auth on kroger.com.
|
||||
- **Receipt location:** kroger.com/mypurchases
|
||||
- **Approach:** Playwright login → scrape purchase history pages or intercept XHR endpoints.
|
||||
- **Anti-bot:** Kroger uses Akamai Bot Manager. Aggressive headless browser detection. Need Playwright stealth, realistic fingerprinting, human-like interaction pacing.
|
||||
- **Prior art:** `phyllis-vance/KrogerScrape` (.NET, old), `callaginn/kroger-sweeper` (Puppeteer/Node), `ThermoMan/Get-Kroger-Grocery-List` (Greasemonkey userscript).
|
||||
- **Kroger public API:** Free developer account at developer.kroger.com provides product catalog data (`product.compact` scope) — useful for enriching scraped receipt data with UPCs, categories, product images. NOT useful for purchase history.
|
||||
- **Data available:** Purchase history tied to Kroger Plus loyalty card. Shows items, prices, quantities.
|
||||
|
||||
#### Target (Circle)
|
||||
- **Auth:** Session-based auth on target.com.
|
||||
- **Receipt location:** target.com account → Orders → In-store tab, or target.com/account/orders
|
||||
- **Approach:** Playwright login → scrape in-store purchase history.
|
||||
- **Data available:** ~1 year of history if user paid with a linked card, used the Target app wallet, or entered their Target Circle phone number at checkout. Includes item names, prices.
|
||||
|
||||
## Tech Stack
|
||||
|
||||
- Python 3.12+
|
||||
- Playwright (Python async API) for headless browser automation
|
||||
- FastAPI (lightweight internal API for triggering scrapes, health checks, status)
|
||||
- SQLAlchemy 2.0 (via `cartsnitch-common`)
|
||||
- Redis (pub/sub event publishing)
|
||||
- APScheduler or Celery (for scheduled scraping jobs)
|
||||
- cryptography / Fernet (encrypting stored session data)
|
||||
|
||||
## Repo Structure
|
||||
|
||||
```
|
||||
receiptwitness/
|
||||
├── CLAUDE.md
|
||||
├── README.md
|
||||
├── pyproject.toml
|
||||
├── Dockerfile # Playwright + Chromium headless
|
||||
├── docker-compose.yml # Local dev (Postgres, Redis, this service)
|
||||
├── src/
|
||||
│ └── receiptwitness/
|
||||
│ ├── __init__.py
|
||||
│ ├── config.py # Service-specific settings
|
||||
│ ├── main.py # FastAPI app + scheduler bootstrap
|
||||
│ ├── scrapers/
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── base.py # Abstract BaseScraper class
|
||||
│ │ ├── meijer.py # Meijer/mPerks scraper
|
||||
│ │ ├── kroger.py # Kroger scraper
|
||||
│ │ └── target.py # Target/Circle scraper
|
||||
│ ├── parsers/
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── meijer.py # Parse raw Meijer receipt data → PurchaseItem records
|
||||
│ │ ├── kroger.py
|
||||
│ │ └── target.py
|
||||
│ ├── session/
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── manager.py # Session storage, retrieval, refresh logic
|
||||
│ │ └── encryption.py # Encrypt/decrypt session cookies at rest
|
||||
│ ├── scheduler.py # Scrape scheduling (per-user cron jobs)
|
||||
│ ├── events.py # Publish cartsnitch.receipts.ingested events to Redis
|
||||
│ ├── api/
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── routes.py # Internal API: trigger scrape, check status, health
|
||||
│ │ └── auth.py # Internal service auth (API key or JWT)
|
||||
│ └── enrichment.py # Optional: enrich receipt data via Kroger public API
|
||||
└── tests/
|
||||
├── conftest.py
|
||||
├── fixtures/ # Sample receipt HTML/JSON for testing parsers
|
||||
│ ├── meijer_receipt.json
|
||||
│ ├── kroger_receipt.html
|
||||
│ └── target_receipt.html
|
||||
├── test_scrapers/
|
||||
├── test_parsers/
|
||||
└── test_session/
|
||||
```
|
||||
|
||||
## Scraper Architecture
|
||||
|
||||
### Base Scraper Pattern
|
||||
|
||||
```python
|
||||
class BaseScraper(ABC):
|
||||
"""All retailer scrapers implement this interface."""
|
||||
|
||||
@abstractmethod
|
||||
async def login(self, credentials: UserStoreAccount) -> SessionData: ...
|
||||
|
||||
@abstractmethod
|
||||
async def check_session(self, session: SessionData) -> bool: ...
|
||||
|
||||
@abstractmethod
|
||||
async def scrape_receipts(self, session: SessionData, since: datetime | None) -> list[RawReceipt]: ...
|
||||
|
||||
@abstractmethod
|
||||
def parse_receipt(self, raw: RawReceipt) -> tuple[Purchase, list[PurchaseItem]]: ...
|
||||
```
|
||||
|
||||
### Scraping Flow
|
||||
|
||||
1. **Scheduler fires** for a user+store combination
|
||||
2. **Load session** from `user_store_accounts` table (encrypted)
|
||||
3. **Check session validity** — quick lightweight request to verify auth
|
||||
4. **If expired:** launch Playwright, re-authenticate, save new session
|
||||
5. **Scrape receipts** since `last_sync_at` timestamp
|
||||
6. **Parse** raw data into `Purchase` and `PurchaseItem` records
|
||||
7. **Deduplicate** — skip receipts already in DB (match on `receipt_id` per store)
|
||||
8. **Write to DB** — insert new purchases and items
|
||||
9. **Derive price_history** entries from purchase_items
|
||||
10. **Publish event** — `cartsnitch.receipts.ingested` to Redis
|
||||
11. **Update** `user_store_accounts.last_sync_at`
|
||||
|
||||
### Session Management
|
||||
|
||||
- Sessions (cookies, tokens) are encrypted at rest using Fernet symmetric encryption.
|
||||
- The encryption key is provided via environment variable, not stored in the DB.
|
||||
- Sessions are stored in the `user_store_accounts` table as encrypted JSONB.
|
||||
- Each scrape attempt first checks if the existing session is valid before launching a full Playwright browser instance.
|
||||
- When a session expires, the service needs the user's stored credentials OR a manual re-auth flow (the user logs in via the frontend, and we capture the session).
|
||||
|
||||
### Anti-Bot Considerations
|
||||
|
||||
- Use `playwright-stealth` or equivalent to mask automation signals.
|
||||
- Set realistic viewport sizes, user agents, and locale settings.
|
||||
- Add human-like delays between page navigations (randomized 1-5 seconds).
|
||||
- For Kroger specifically (Akamai Bot Manager): may need to use non-headless mode on initial auth, or route through a persistent browser profile that has established trust.
|
||||
- Rate limit scraping: no more than 1 scrape per user per store per hour. Default cadence: once daily.
|
||||
- Store and reuse browser profiles/cookies to minimize fresh logins.
|
||||
|
||||
### Dockerfile
|
||||
|
||||
The Dockerfile must include Playwright and Chromium. Base image pattern:
|
||||
|
||||
```dockerfile
|
||||
FROM mcr.microsoft.com/playwright/python:v1.49.0-noble
|
||||
# Install deps, copy code, etc.
|
||||
```
|
||||
|
||||
This is a large image (~2GB) due to Chromium. Consider multi-stage builds if the final image can be slimmed down.
|
||||
|
||||
## Internal API Endpoints
|
||||
|
||||
This service exposes a lightweight internal API (not public-facing):
|
||||
|
||||
- `GET /health` — health check
|
||||
- `GET /status/{user_id}` — sync status per store for a user
|
||||
- `POST /scrape/{user_id}/{store_slug}` — trigger an immediate scrape for a user+store
|
||||
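- `POST /scrape/{user_id}/all` — trigger scrape across all configured stores (`all` is a reserved value, not a store slug; register this route before `/scrape/{user_id}/{store_slug}` so FastAPI matches it first)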
|
||||
- `GET /sessions/{user_id}` — list configured store sessions and their status
|
||||
|
||||
The public-facing API gateway (`cartsnitch/api`) proxies user-facing requests to this service's internal API.
|
||||
|
||||
## Events Published
|
||||
|
||||
### `cartsnitch.receipts.ingested`
|
||||
|
||||
Published after new receipt data is successfully written to the DB.
|
||||
|
||||
```json
|
||||
{
|
||||
"event_type": "cartsnitch.receipts.ingested",
|
||||
"timestamp": "2026-03-15T12:00:00Z",
|
||||
"service": "receiptwitness",
|
||||
"payload": {
|
||||
"user_id": "uuid",
|
||||
"store_slug": "meijer",
|
||||
"purchase_id": "uuid",
|
||||
"purchase_date": "2026-03-14",
|
||||
"item_count": 23,
|
||||
"total": 87.42
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Development Workflow
|
||||
|
||||
- **Never push directly to main.** Always create feature branches and open PRs.
|
||||
- Branch naming: `feature/<store>/<description>` or `fix/<description>`
|
||||
- Use conventional commits: `feat:`, `fix:`, `refactor:`, `docs:`, `chore:`
|
||||
- Test parsers with fixture data (sample receipts in `tests/fixtures/`). Scraper integration tests require real credentials and should be tagged/skipped in CI.
|
||||
- Local dev: `docker-compose up` starts Postgres, Redis, and the service. Playwright runs inside the container.
|
||||
|
||||
## Important Notes
|
||||
|
||||
- The Playwright container image is large. On K8s, consider using a dedicated node or tolerating scheduling delays.
|
||||
- Each user needs their own authenticated sessions. At 1,000 users × 3 stores = 3,000 sessions to manage. Sessions expire at different rates per retailer.
|
||||
- Scraping must be respectful: randomized intervals, rate limiting, no parallel scraping of the same store for the same user.
|
||||
- Receipt data structure varies significantly between retailers. The parsers must be robust and handle edge cases (returns, voided items, weighted produce, BOGO items, coupon stacking).
|
||||
- Kroger's public API (`product.compact` scope) can be used to enrich scraped data with UPCs and product metadata after receipt parsing. This is optional but improves product normalization downstream.
|
||||
- Store credentials for users should ideally NOT be stored by CartSnitch. Prefer a flow where the user authenticates in a controlled browser session, and we capture/store only the resulting session cookies. If credential storage is necessary, use strong encryption and make the tradeoffs clear to users.
|
||||
@@ -0,0 +1,67 @@
|
||||
# Stage 1: Build dependencies
|
||||
FROM python:3.12-slim AS build
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# git is required to install cartsnitch-common from GitHub; build-essential and
|
||||
# libpq-dev are needed to compile any C-extension wheels (e.g. psycopg2 fallback)
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
git \
|
||||
libpq-dev \
|
||||
build-essential \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY pyproject.toml ./
|
||||
COPY src/ ./src/
|
||||
|
||||
# cartsnitch-common is not on PyPI — install it directly from GitHub, then
|
||||
# install the rest of the package dependencies in a single resolver pass so
|
||||
# pip can satisfy the cartsnitch-common>=0.1.0 constraint declared in
|
||||
# pyproject.toml without hitting PyPI for it.
|
||||
RUN pip install --no-cache-dir --prefix=/install \
|
||||
"cartsnitch-common @ git+https://github.com/cartsnitch/common.git@76685ed0384103228cd670b477b967e7752ebe6b" \
|
||||
.
|
||||
|
||||
# Stage 2: Production image with Playwright + Chromium
|
||||
FROM python:3.12-slim AS prod
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install Playwright system dependencies for Chromium
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
libnss3 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
libcups2 \
|
||||
libdrm2 \
|
||||
libxkbcommon0 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxrandr2 \
|
||||
libgbm1 \
|
||||
libpango-1.0-0 \
|
||||
libcairo2 \
|
||||
libasound2 \
|
||||
libxshmfence1 \
|
||||
libx11-xcb1 \
|
||||
libxcb-dri3-0 \
|
||||
fonts-liberation \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN adduser --system --group --uid 1000 app
|
||||
|
||||
COPY --from=build /install /usr/local
|
||||
COPY src/ ./src/
|
||||
|
||||
# Install Playwright Chromium browser (runs as root; /opt/playwright is world-readable)
|
||||
RUN PLAYWRIGHT_BROWSERS_PATH=/opt/playwright playwright install chromium
|
||||
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH=/opt/playwright
|
||||
|
||||
USER 1000
|
||||
EXPOSE 8000
|
||||
|
||||
HEALTHCHECK --interval=30s --timeout=3s \
|
||||
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"
|
||||
|
||||
CMD ["uvicorn", "receiptwitness.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
@@ -0,0 +1,54 @@
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "receiptwitness"
|
||||
version = "0.1.0"
|
||||
description = "CartSnitch receipt/purchase history ingestion service"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"cartsnitch-common>=0.1.0",
|
||||
"playwright>=1.49,<2.0",
|
||||
"playwright-stealth>=1.0,<2.0",
|
||||
"cryptography>=42.0,<44.0",
|
||||
"fastapi>=0.115,<1.0",
|
||||
"uvicorn[standard]>=0.30,<1.0",
|
||||
"redis>=5.0,<6.0",
|
||||
"pydantic>=2.0,<3.0",
|
||||
"pydantic-settings>=2.0,<3.0",
|
||||
"sqlalchemy[asyncio]>=2.0,<3.0",
|
||||
"asyncpg>=0.29,<1.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = [
|
||||
"pytest>=8.0",
|
||||
"pytest-asyncio>=0.23",
|
||||
"ruff>=0.3",
|
||||
"pytest-cov>=5.0",
|
||||
]
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/receiptwitness"]
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py312"
|
||||
line-length = 100
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = ["E", "F", "I", "N", "W", "UP"]
|
||||
|
||||
[tool.mypy]
|
||||
python_version = "3.12"
|
||||
strict = false
|
||||
warn_return_any = true
|
||||
warn_unused_ignores = true
|
||||
|
||||
[[tool.mypy.overrides]]
|
||||
module = "cartsnitch_common.*"
|
||||
ignore_missing_imports = true
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
asyncio_mode = "auto"
|
||||
testpaths = ["tests"]
|
||||
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
|
||||
"extends": ["local>cartsnitch/.github:renovate-config"]
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
"""ReceiptWitness — CartSnitch receipt ingestion service."""
|
||||
@@ -0,0 +1 @@
|
||||
"""Internal API for ReceiptWitness service."""
|
||||
@@ -0,0 +1,10 @@
|
||||
"""Internal API routes for triggering scrapes and checking status."""
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/health")
async def health():
    """Liveness probe for the service.

    Returns a fixed JSON body; no downstream dependency is contacted here.
    """
    body = {"status": "ok", "service": "receiptwitness"}
    return body
|
||||
@@ -0,0 +1,26 @@
|
||||
"""Service-specific configuration for ReceiptWitness."""
|
||||
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
|
||||
class ReceiptWitnessSettings(BaseSettings):
    """Runtime configuration for the ReceiptWitness service.

    Values are loaded by ``BaseSettings``; with ``env_prefix`` set to ``RW_``
    each field is overridden by the environment variable ``RW_<FIELD_NAME>``,
    and the defaults below apply otherwise.
    """

    model_config = dict(env_prefix="RW_")

    # Connection URLs.
    # NOTE(review): the previous comment described these as "inherited from
    # cartsnitch-common", but this class only subclasses BaseSettings — confirm
    # the defaults are meant to mirror that package.
    database_url: str = "postgresql+asyncpg://cartsnitch:cartsnitch@localhost:5432/cartsnitch"
    redis_url: str = "redis://localhost:6379/0"

    # Key used for session encryption; empty unless supplied via the environment.
    session_encryption_key: str = ""

    # Scraping defaults.
    scrape_interval_seconds: int = 24 * 60 * 60  # 24 hours
    min_request_delay_ms: int = 1_000
    max_request_delay_ms: int = 5_000

    # Playwright.
    headless: bool = True
    browser_timeout_ms: int = 60_000


# Module-level settings instance, built once when this module is imported.
settings = ReceiptWitnessSettings()
|
||||
@@ -0,0 +1,75 @@
|
||||
"""Publish receipt ingestion events to Redis/DragonflyDB pub/sub."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from datetime import UTC, datetime
|
||||
from decimal import Decimal
|
||||
|
||||
import redis.asyncio as aioredis
|
||||
|
||||
from receiptwitness.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
CHANNEL_RECEIPTS_INGESTED = "cartsnitch.receipts.ingested"
|
||||
|
||||
# Module-level connection pool — shared across all publish calls
|
||||
_pool: aioredis.ConnectionPool | None = None
|
||||
|
||||
|
||||
class _DecimalEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``Decimal`` values as floats."""

    def default(self, o):
        """Return ``float(o)`` for a ``Decimal``; defer anything else to the base class.

        The base implementation raises ``TypeError`` for types it cannot encode.
        """
        if not isinstance(o, Decimal):
            return super().default(o)
        return float(o)
|
||||
|
||||
|
||||
def _get_pool() -> aioredis.ConnectionPool:
    """Return the module-wide Redis connection pool, creating it on first use.

    The pool is built from ``settings.redis_url`` and cached in the module
    global ``_pool`` so later calls reuse the same object.
    """
    global _pool
    if _pool is not None:
        return _pool
    _pool = aioredis.ConnectionPool.from_url(
        settings.redis_url,
        max_connections=10,
        decode_responses=True,
    )
    return _pool
|
||||
|
||||
|
||||
async def get_redis_client() -> aioredis.Redis:
    """Build an async Redis/DragonflyDB client on top of the shared connection pool."""
    shared_pool = _get_pool()
    return aioredis.Redis(connection_pool=shared_pool)
|
||||
|
||||
|
||||
async def publish_receipt_ingested(
    user_id: str,
    store_slug: str,
    purchase_id: str,
    purchase_date: str,
    item_count: int,
    total: Decimal | float,
) -> None:
    """Announce a newly ingested receipt on the ``cartsnitch.receipts.ingested`` channel.

    Builds the event envelope (event type, UTC timestamp, service name and a
    payload describing the purchase), serializes it to JSON and publishes it
    through the shared Redis client.

    Raises:
        aioredis.ConnectionError: re-raised after logging when publishing fails
            because of a connection error.
    """
    # A Decimal total is converted to float before it enters the payload;
    # any other value is passed through unchanged.
    payload_total = float(total) if isinstance(total, Decimal) else total

    payload = {
        "user_id": user_id,
        "store_slug": store_slug,
        "purchase_id": purchase_id,
        "purchase_date": purchase_date,
        "item_count": item_count,
        "total": payload_total,
    }
    event = {
        "event_type": CHANNEL_RECEIPTS_INGESTED,
        "timestamp": datetime.now(UTC).isoformat(),
        "service": "receiptwitness",
        "payload": payload,
    }

    try:
        client = await get_redis_client()
        message = json.dumps(event, cls=_DecimalEncoder)
        await client.publish(CHANNEL_RECEIPTS_INGESTED, message)
        logger.info(
            "Published %s event for purchase %s",
            CHANNEL_RECEIPTS_INGESTED,
            purchase_id,
        )
    except aioredis.ConnectionError:
        logger.error("Failed to publish event — Redis/DragonflyDB connection error")
        raise
|
||||
@@ -0,0 +1,8 @@
|
||||
"""FastAPI app entrypoint for ReceiptWitness."""
|
||||
|
||||
from fastapi import FastAPI
|
||||
|
||||
from receiptwitness.api.routes import router
|
||||
|
||||
app = FastAPI(title="ReceiptWitness", version="0.1.0")
|
||||
app.include_router(router)
|
||||
@@ -0,0 +1 @@
|
||||
"""Receipt parsers for each retailer."""
|
||||
@@ -0,0 +1,148 @@
|
||||
"""Kroger receipt parser.
|
||||
|
||||
Transforms raw Kroger receipt JSON into the common PurchaseCreate schema.
|
||||
Kroger receipt data uses different field names than Meijer — this parser
|
||||
handles Kroger-specific naming conventions and receipt structure.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from decimal import Decimal, InvalidOperation
|
||||
|
||||
from receiptwitness.scrapers.base import RawReceipt
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _to_decimal(value, default: str = "0") -> Decimal:
    """Safely convert a value to a finite Decimal.

    Args:
        value: Raw value taken from receipt data (number, numeric string or None).
        default: Decimal literal returned when ``value`` cannot be used.

    Returns:
        ``Decimal(str(value))`` when that is a finite number, otherwise
        ``Decimal(default)``. None, unparseable input and NaN/Infinity all
        fall back to the default.
    """
    if value is None:
        return Decimal(default)
    try:
        parsed = Decimal(str(value))
    except (InvalidOperation, ValueError, TypeError):
        return Decimal(default)
    # Decimal() happily parses "NaN"/"Infinity" (and str(float("nan")) etc.),
    # and json.loads accepts those literals, so reject non-finite results
    # explicitly instead of letting them flow into prices and totals.
    if not parsed.is_finite():
        return Decimal(default)
    return parsed
|
||||
|
||||
|
||||
def _first_present(item: dict, *keys: str):
    """Return the value of the first key in ``keys`` that is present and not None.

    A null value is treated like a missing key so that it cannot mask a
    populated alternative field name. Returns None when no key yields a value.
    """
    for key in keys:
        value = item.get(key)
        if value is not None:
            return value
    return None


def _parse_item(item: dict) -> dict:
    """Parse a single line item from a Kroger receipt.

    Kroger items typically include fields like:
    - description / itemDescription / productName
    - upc / krogerProductId
    - quantity / qty
    - basePrice / unitPrice / price
    - totalPrice / extendedAmount / lineTotal
    - regularPrice / originalPrice
    - salePrice / promoPrice
    - couponAmount / couponSavings
    - loyaltyDiscount / fuelPointsDiscount / plusCardSavings
    - department / category / aisle

    For each group of alternative names the first one holding a non-null
    value is used.

    Args:
        item: One raw line-item dict from the receipt detail.

    Returns:
        A dict with the keys ``product_name_raw``, ``upc``, ``quantity``,
        ``unit_price``, ``extended_price``, ``regular_price``, ``sale_price``,
        ``coupon_discount``, ``loyalty_discount`` and ``category_raw``.
        Quantity and the unit/extended prices are always Decimals; the
        remaining price fields are Decimal or None when absent.
    """

    def optional_decimal(value):
        # Keep "field absent" (None) distinguishable from an explicit zero.
        return _to_decimal(value) if value is not None else None

    description = (
        item.get("description")
        or item.get("itemDescription")
        or item.get("productName")
        or item.get("name")
        or "UNKNOWN ITEM"
    )

    quantity = _to_decimal(_first_present(item, "quantity", "qty", "quantitySold"), "1")
    unit_price = _to_decimal(_first_present(item, "basePrice", "unitPrice", "price"))
    extended_price = _to_decimal(
        _first_present(item, "totalPrice", "extendedAmount", "lineTotal")
    )

    # Compute extended_price if not provided.
    # NOTE(review): an explicit zero line total is treated the same as a
    # missing one here — confirm that is intended for free/BOGO items.
    if extended_price == Decimal("0") and unit_price != Decimal("0"):
        extended_price = unit_price * quantity

    regular_price = _first_present(item, "regularPrice", "originalPrice")
    sale_price = _first_present(item, "salePrice", "promoPrice")
    coupon_discount = _first_present(item, "couponAmount", "couponSavings", "couponDiscount")
    loyalty_discount = _first_present(
        item, "plusCardSavings", "loyaltyDiscount", "fuelPointsDiscount"
    )

    # UPC handling — Kroger may use krogerProductId or upc. Leading zeros are
    # stripped; a value that ends up empty (blank, "000", 0) becomes None.
    upc = _first_present(item, "upc", "UPC", "krogerProductId")
    if upc is not None:
        upc = str(upc).strip().lstrip("0") or None

    category = _first_present(item, "department", "category", "aisle")

    # NOTE(review): the previous version collected weight / netWeight and
    # weightUom / unitOfMeasure into a local dict that was never returned.
    # That dead code is removed; emit those fields here once the target schema
    # has a place for them.

    return {
        # str() guards against non-string values (e.g. a numeric aisle).
        "product_name_raw": str(description).strip(),
        "upc": upc,
        "quantity": quantity,
        "unit_price": unit_price,
        "extended_price": extended_price,
        "regular_price": optional_decimal(regular_price),
        "sale_price": optional_decimal(sale_price),
        "coupon_discount": optional_decimal(coupon_discount),
        "loyalty_discount": optional_decimal(loyalty_discount),
        "category_raw": str(category).strip() if category else None,
    }
|
||||
|
||||
|
||||
def parse_kroger_receipt(raw: RawReceipt) -> dict:
    """Parse a RawReceipt from Kroger into a PurchaseCreate-compatible dict.

    Line items are read from ``raw.raw_data["detail"]`` and totals from the
    detail object first, then from the top-level data. Items flagged as
    voided or returned are skipped.

    Args:
        raw: The scraped receipt; ``raw_data``, ``receipt_id``,
            ``purchase_date`` and ``source_url`` are read from it.

    Returns:
        A dict with the keys ``receipt_id``, ``purchase_date``, ``total``,
        ``subtotal``, ``tax``, ``savings_total``, ``source_url``, ``raw_data``
        and ``items`` (a list of dicts produced by ``_parse_item``).
        ``subtotal``, ``tax`` and ``savings_total`` are None when absent.
    """
    data = raw.raw_data
    # A "detail" key that is present but null is treated like a missing one,
    # instead of raising AttributeError on None.get().
    detail = data.get("detail") or {}

    # Parse items — Kroger uses "items" or "lineItems" or "receiptItems".
    # A null item list is treated as empty rather than failing the iteration.
    raw_items = (
        detail.get("items", detail.get("lineItems", detail.get("receiptItems", []))) or []
    )
    items = []
    for raw_item in raw_items:
        # Skip voided / returned items
        if raw_item.get("voided") or raw_item.get("status") in ("VOIDED", "RETURNED"):
            logger.debug("Skipping voided/returned item: %s", raw_item.get("description"))
            continue
        if raw_item.get("returnFlag") or raw_item.get("isReturn"):
            logger.debug("Skipping returned item: %s", raw_item.get("description"))
            continue
        items.append(_parse_item(raw_item))

    # Parse totals — Kroger uses various field names
    total = _to_decimal(
        detail.get(
            "total",
            data.get("total", data.get("orderTotal", data.get("grandTotal", 0))),
        )
    )
    subtotal = detail.get("subtotal", data.get("subtotal", data.get("subTotal")))
    tax = detail.get("tax", data.get("tax", data.get("salesTax")))
    savings = detail.get(
        "totalSavings",
        data.get("savings", data.get("totalDiscount", data.get("youSaved"))),
    )

    return {
        "receipt_id": raw.receipt_id,
        "purchase_date": raw.purchase_date,
        "total": total,
        "subtotal": _to_decimal(subtotal) if subtotal is not None else None,
        "tax": _to_decimal(tax) if tax is not None else None,
        "savings_total": _to_decimal(savings) if savings is not None else None,
        "source_url": raw.source_url,
        "raw_data": data,
        "items": items,
    }
|
||||
@@ -0,0 +1,138 @@
|
||||
"""Parse raw Meijer mPerks receipt data into PurchaseCreate-compatible dicts.
|
||||
|
||||
The mPerks receipt JSON structure (reverse-engineered from their SPA)
|
||||
typically looks like:
|
||||
|
||||
Transaction listing:
|
||||
{
|
||||
"transactions": [
|
||||
{
|
||||
"transactionId": "12345",
|
||||
"transactionDate": "2026-03-10T14:30:00Z",
|
||||
"storeNumber": "123",
|
||||
"total": 87.42,
|
||||
"savings": 12.50
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
Receipt detail:
|
||||
{
|
||||
"receiptId": "12345",
|
||||
"items": [
|
||||
{
|
||||
"description": "ORGANIC BANANAS",
|
||||
"upc": "0000000004011",
|
||||
"quantity": 1,
|
||||
"price": 0.69,
|
||||
"extendedPrice": 0.69,
|
||||
"regularPrice": 0.79,
|
||||
"salePrice": 0.69,
|
||||
"couponDiscount": 0.0,
|
||||
"mperksDiscount": 0.10,
|
||||
"category": "PRODUCE"
|
||||
}
|
||||
],
|
||||
"subtotal": 74.92,
|
||||
"tax": 5.24,
|
||||
"total": 87.42,
|
||||
"totalSavings": 12.50
|
||||
}
|
||||
"""
|
||||
|
||||
import logging
|
||||
from decimal import Decimal, InvalidOperation
|
||||
|
||||
from receiptwitness.scrapers.base import RawReceipt
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _to_decimal(value, default: str = "0") -> Decimal:
    """Safely convert a value to a finite Decimal.

    Args:
        value: Anything whose ``str()`` form may be a decimal number
            (int, float, str, Decimal), or ``None``.
        default: Decimal literal returned when ``value`` is ``None``, cannot
            be parsed, or parses to NaN / Infinity.

    Returns:
        The parsed Decimal, or ``Decimal(default)`` in the cases above.
    """
    if value is None:
        return Decimal(default)
    try:
        parsed = Decimal(str(value))
    except (InvalidOperation, ValueError, TypeError):
        return Decimal(default)
    # "nan" and "inf" parse without error but are not usable amounts, and a
    # NaN makes later ordering comparisons raise.
    if not parsed.is_finite():
        return Decimal(default)
    return parsed
|
||||
|
||||
|
||||
def _parse_item(item: dict) -> dict:
    """Parse a single line item from Meijer receipt detail.

    Field lookups accept alternate key names: ``description`` /
    ``itemDescription`` / ``name``; ``quantity`` / ``qty``; ``price`` /
    ``unitPrice``; ``extendedPrice`` / ``totalPrice``; ``couponDiscount`` /
    ``couponSavings``; ``mperksDiscount`` / ``loyaltyDiscount``; ``upc`` /
    ``UPC``; ``category`` / ``departmentDescription``.

    Args:
        item: One raw line-item dict from the receipt detail payload.

    Returns:
        A dict with ``product_name_raw``, ``upc``, ``quantity``,
        ``unit_price``, ``extended_price``, ``regular_price``, ``sale_price``,
        ``coupon_discount``, ``loyalty_discount`` and ``category_raw``.
        Optional amounts are ``None`` when the item does not carry them.
    """
    description = (
        item.get("description") or item.get("itemDescription") or item.get("name") or "UNKNOWN ITEM"
    )

    quantity = _to_decimal(item.get("quantity", item.get("qty", 1)), "1")
    unit_price = _to_decimal(item.get("price", item.get("unitPrice", 0)))

    raw_extended = item.get("extendedPrice", item.get("totalPrice"))
    extended_price = _to_decimal(raw_extended)

    # Derive the line total only when the receipt did not supply one. An
    # explicit 0 (for example a fully discounted item) is a real value and
    # must not be replaced by unit_price * quantity.
    extended_missing = raw_extended is None or raw_extended == ""
    if extended_missing and unit_price != Decimal("0"):
        extended_price = unit_price * quantity

    regular_price = item.get("regularPrice")
    sale_price = item.get("salePrice")
    coupon_discount = item.get("couponDiscount", item.get("couponSavings"))
    loyalty_discount = item.get("mperksDiscount", item.get("loyaltyDiscount"))

    upc = item.get("upc", item.get("UPC"))
    if upc:
        # Leading zeros are dropped; a UPC made only of zeros becomes None.
        upc = str(upc).strip().lstrip("0") or None

    category = item.get("category", item.get("departmentDescription"))

    return {
        # str() guards against non-string values, which have no .strip().
        "product_name_raw": str(description).strip(),
        "upc": upc,
        "quantity": quantity,
        "unit_price": unit_price,
        "extended_price": extended_price,
        "regular_price": _to_decimal(regular_price) if regular_price is not None else None,
        "sale_price": _to_decimal(sale_price) if sale_price is not None else None,
        "coupon_discount": (_to_decimal(coupon_discount) if coupon_discount is not None else None),
        "loyalty_discount": (
            _to_decimal(loyalty_discount) if loyalty_discount is not None else None
        ),
        "category_raw": str(category).strip() if category else None,
    }
|
||||
|
||||
|
||||
def parse_meijer_receipt(raw: RawReceipt) -> dict:
    """Parse a RawReceipt from Meijer into a PurchaseCreate-compatible dict.

    Returns a dict with keys matching PurchaseCreate schema fields.
    The caller is responsible for setting store_id and store_location_id
    from the store registry.

    Line items come from ``raw.raw_data["detail"]`` (``items`` or
    ``lineItems``); voided lines are skipped. Totals are read from the detail
    payload first and then from the top level of the payload. A key that is
    present but ``null`` is treated like a missing key, so it neither hides a
    fallback value nor raises when ``detail`` or the item list is ``null``.

    Args:
        raw: Scraped receipt whose ``raw_data`` holds the retailer payload.

    Returns:
        A dict with ``receipt_id``, ``purchase_date``, ``total``,
        ``subtotal``, ``tax``, ``savings_total``, ``source_url``, ``raw_data``
        and ``items``. ``total`` is always a Decimal (0 when absent); the
        other amounts are ``None`` when the payload does not provide them.
    """

    def first_present(*candidates):
        # First candidate that is not None; None when all of them are None.
        return next((c for c in candidates if c is not None), None)

    data = raw.raw_data
    detail = data.get("detail") or {}

    # Parse items from the detail response
    raw_items = first_present(detail.get("items"), detail.get("lineItems"), [])
    items = []
    for raw_item in raw_items:
        # Skip voided items
        if raw_item.get("voided") or raw_item.get("status") == "VOIDED":
            logger.debug("Skipping voided item: %s", raw_item.get("description"))
            continue
        items.append(_parse_item(raw_item))

    # Parse totals
    total = _to_decimal(
        first_present(detail.get("total"), data.get("total"), data.get("transactionTotal"))
    )
    subtotal = first_present(detail.get("subtotal"), data.get("subtotal"))
    tax = first_present(detail.get("tax"), data.get("tax"))
    savings = first_present(
        detail.get("totalSavings"), data.get("savings"), data.get("totalDiscount")
    )

    return {
        "receipt_id": raw.receipt_id,
        "purchase_date": raw.purchase_date,
        "total": total,
        "subtotal": _to_decimal(subtotal) if subtotal is not None else None,
        "tax": _to_decimal(tax) if tax is not None else None,
        "savings_total": _to_decimal(savings) if savings is not None else None,
        "source_url": raw.source_url,
        "raw_data": data,
        "items": items,
    }
|
||||
@@ -0,0 +1,191 @@
|
||||
"""Target Circle receipt parser.
|
||||
|
||||
Transforms raw Target in-store receipt JSON into the common PurchaseCreate schema.
|
||||
Target receipt data includes Circle pricing, BOGO deals, and Circle rewards
|
||||
discounts that need special handling.
|
||||
|
||||
Target receipt detail structure (reverse-engineered from target.com SPA):
|
||||
|
||||
{
|
||||
"orderId": "TGT-2026-0315-7890",
|
||||
"items": [
|
||||
{
|
||||
"description": "GOOD & GATHER WHOLE MILK GAL",
|
||||
"tcin": "14767459",
|
||||
"upc": "0085239100123",
|
||||
"quantity": 1,
|
||||
"unitPrice": 3.89,
|
||||
"totalPrice": 3.89,
|
||||
"regularPrice": 4.19,
|
||||
"circlePrice": 3.89,
|
||||
"couponDiscount": 0.0,
|
||||
"circleRewardsDiscount": 0.30,
|
||||
"promoDescription": "Circle offer: Save 30c",
|
||||
"department": "GROCERY"
|
||||
}
|
||||
],
|
||||
"subtotal": 78.32,
|
||||
"tax": 4.89,
|
||||
"total": 83.21,
|
||||
"totalSavings": 11.45
|
||||
}
|
||||
"""
|
||||
|
||||
import logging
|
||||
from decimal import Decimal, InvalidOperation
|
||||
|
||||
from receiptwitness.scrapers.base import RawReceipt
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _to_decimal(value, default: str = "0") -> Decimal:
    """Safely convert a value to a finite Decimal.

    Args:
        value: Anything whose ``str()`` form may be a decimal number
            (int, float, str, Decimal), or ``None``.
        default: Decimal literal returned when ``value`` is ``None``, cannot
            be parsed, or parses to NaN / Infinity.

    Returns:
        The parsed Decimal, or ``Decimal(default)`` in the cases above.
    """
    if value is None:
        return Decimal(default)
    try:
        candidate = Decimal(str(value))
    except (InvalidOperation, ValueError, TypeError):
        return Decimal(default)
    # Non-finite results ("nan", "inf") are rejected: they are not usable
    # prices or quantities and a NaN makes ordering comparisons raise.
    return candidate if candidate.is_finite() else Decimal(default)
|
||||
|
||||
|
||||
def _parse_item(item: dict) -> dict:
    """Parse a single line item from a Target receipt.

    Target items may include fields like:
    - description / itemDescription / productName / name
    - upc / UPC
    - quantity / qty / quantitySold
    - unitPrice / price / basePrice
    - totalPrice / extendedPrice / lineTotal
    - regularPrice / originalPrice
    - circlePrice / salePrice / promoPrice
    - couponDiscount / couponSavings / couponAmount
    - circleRewardsDiscount / circleDiscount / loyaltyDiscount
    - department / category

    Args:
        item: One raw line-item dict from the receipt detail payload.

    Returns:
        A dict with ``product_name_raw``, ``upc``, ``quantity``,
        ``unit_price``, ``extended_price``, ``regular_price``, ``sale_price``,
        ``coupon_discount``, ``loyalty_discount`` and ``category_raw``.
        Optional amounts are ``None`` when the item does not carry them.
    """
    description = (
        item.get("description")
        or item.get("itemDescription")
        or item.get("productName")
        or item.get("name")
        or "UNKNOWN ITEM"
    )

    quantity = _to_decimal(item.get("quantity", item.get("qty", item.get("quantitySold", 1))), "1")
    unit_price = _to_decimal(item.get("unitPrice", item.get("price", item.get("basePrice", 0))))

    raw_extended = item.get("totalPrice", item.get("extendedPrice", item.get("lineTotal")))
    extended_price = _to_decimal(raw_extended)

    # Derive the line total only when the receipt did not supply one. An
    # explicit 0 (for example the free item of a BOGO deal) is a real value
    # and must not be replaced by unit_price * quantity.
    extended_missing = raw_extended is None or raw_extended == ""
    if extended_missing and unit_price != Decimal("0"):
        extended_price = unit_price * quantity

    regular_price = item.get("regularPrice", item.get("originalPrice"))
    # Target Circle pricing — circlePrice takes precedence over generic salePrice
    sale_price = item.get("circlePrice", item.get("salePrice", item.get("promoPrice")))
    coupon_discount = item.get(
        "couponDiscount", item.get("couponSavings", item.get("couponAmount"))
    )
    # Circle rewards / loyalty discount
    loyalty_discount = item.get(
        "circleRewardsDiscount",
        item.get("circleDiscount", item.get("loyaltyDiscount")),
    )

    upc = item.get("upc", item.get("UPC"))
    if upc:
        # Leading zeros are dropped; a UPC made only of zeros becomes None.
        upc = str(upc).strip().lstrip("0") or None

    category = item.get("department", item.get("category"))

    # NOTE(review): the previous version also gathered tcin / dpci,
    # promoDescription / offerDescription and weight / weightUom into a local
    # dict that was never returned. That dead code is removed here; confirm
    # whether those fields should be surfaced in the result.

    return {
        # str() guards against non-string values, which have no .strip().
        "product_name_raw": str(description).strip(),
        "upc": upc,
        "quantity": quantity,
        "unit_price": unit_price,
        "extended_price": extended_price,
        "regular_price": _to_decimal(regular_price) if regular_price is not None else None,
        "sale_price": _to_decimal(sale_price) if sale_price is not None else None,
        "coupon_discount": (_to_decimal(coupon_discount) if coupon_discount is not None else None),
        "loyalty_discount": (
            _to_decimal(loyalty_discount) if loyalty_discount is not None else None
        ),
        "category_raw": str(category).strip() if category else None,
    }
|
||||
|
||||
|
||||
def parse_target_receipt(raw: RawReceipt) -> dict:
    """Parse a RawReceipt from Target into a PurchaseCreate-compatible dict.

    Line items come from ``raw.raw_data["detail"]`` (``items`` or
    ``lineItems``); voided, returned and cancelled lines are skipped. Totals
    are read from the detail payload first and then from the top level of the
    payload under alternate key names. A key that is present but ``null`` is
    treated like a missing key, so it neither hides a fallback value nor
    raises when ``detail`` or the item list is ``null``.

    Args:
        raw: Scraped receipt whose ``raw_data`` holds the retailer payload.

    Returns:
        A dict with ``receipt_id``, ``purchase_date``, ``total``,
        ``subtotal``, ``tax``, ``savings_total``, ``source_url``, ``raw_data``
        and ``items``. ``total`` is always a Decimal (0 when absent); the
        other amounts are ``None`` when the payload does not provide them.
    """

    def first_present(*candidates):
        # First candidate that is not None; None when all of them are None.
        return next((c for c in candidates if c is not None), None)

    data = raw.raw_data
    detail = data.get("detail") or {}

    # Parse items — Target uses "items" or "lineItems"
    raw_items = first_present(detail.get("items"), detail.get("lineItems"), [])
    items = []
    for raw_item in raw_items:
        # Skip voided / returned items
        if raw_item.get("voided") or raw_item.get("status") in (
            "VOIDED",
            "RETURNED",
            "CANCELLED",
        ):
            logger.debug("Skipping voided/returned item: %s", raw_item.get("description"))
            continue
        if raw_item.get("returnFlag") or raw_item.get("isReturn"):
            logger.debug("Skipping returned item: %s", raw_item.get("description"))
            continue
        items.append(_parse_item(raw_item))

    # Parse totals
    total = _to_decimal(
        first_present(
            detail.get("total"),
            data.get("total"),
            data.get("orderTotal"),
            data.get("grandTotal"),
        )
    )
    subtotal = first_present(detail.get("subtotal"), data.get("subtotal"), data.get("subTotal"))
    tax = first_present(detail.get("tax"), data.get("tax"), data.get("salesTax"))
    savings = first_present(
        detail.get("totalSavings"),
        data.get("savings"),
        data.get("totalDiscount"),
        data.get("circleSavings"),
    )

    return {
        "receipt_id": raw.receipt_id,
        "purchase_date": raw.purchase_date,
        "total": total,
        "subtotal": _to_decimal(subtotal) if subtotal is not None else None,
        "tax": _to_decimal(tax) if tax is not None else None,
        "savings_total": _to_decimal(savings) if savings is not None else None,
        "source_url": raw.source_url,
        "raw_data": data,
        "items": items,
    }
|
||||
@@ -0,0 +1,30 @@
|
||||
"""Receipt & product matching pipeline — receipt normalization and product dedup."""
|
||||
|
||||
from receiptwitness.pipeline.matching import (
|
||||
ConfidenceLevel,
|
||||
ProductMatcher,
|
||||
match_purchase_item,
|
||||
)
|
||||
from receiptwitness.pipeline.normalization import (
|
||||
MatchMethod,
|
||||
MatchResult,
|
||||
clean_name,
|
||||
extract_size_info,
|
||||
jaccard_similarity,
|
||||
normalize_product,
|
||||
)
|
||||
from receiptwitness.pipeline.receipt import normalize_receipt, parse_meijer_item
|
||||
|
||||
__all__ = [
|
||||
"ConfidenceLevel",
|
||||
"MatchMethod",
|
||||
"MatchResult",
|
||||
"ProductMatcher",
|
||||
"clean_name",
|
||||
"extract_size_info",
|
||||
"jaccard_similarity",
|
||||
"match_purchase_item",
|
||||
"normalize_product",
|
||||
"normalize_receipt",
|
||||
"parse_meijer_item",
|
||||
]
|
||||
@@ -0,0 +1,136 @@
|
||||
"""Product matching & dedup — UPC primary, fuzzy name fallback, confidence scoring.
|
||||
|
||||
Wraps the Phase 1 normalization module with confidence-level classification
|
||||
and batch matching for purchase ingestion.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
|
||||
from cartsnitch_common.constants import MatchConfidence
|
||||
from cartsnitch_common.models.product import NormalizedProduct
|
||||
from cartsnitch_common.schemas.purchase import PurchaseItemCreate
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from receiptwitness.pipeline.normalization import (
|
||||
MatchMethod,
|
||||
MatchResult,
|
||||
extract_size_info,
|
||||
normalize_product,
|
||||
)
|
||||
|
||||
# Re-export for convenience
|
||||
ConfidenceLevel = MatchConfidence
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MatchOutcome:
    """Immutable record of how one purchase item fared during matching."""

    # Position of the item within the batch handed to the matcher.
    item_index: int
    # Details of the match, or None when no existing product matched.
    match: MatchResult | None
    # Confidence bucket assigned to this item.
    confidence_level: MatchConfidence
    # Whether a product had to be created because nothing matched.
    created_new: bool = False
|
||||
|
||||
|
||||
def classify_confidence(score: float, method: MatchMethod) -> MatchConfidence:
    """Bucket a match into HIGH, MEDIUM or LOW confidence.

    A UPC match is always HIGH regardless of score. A name match is HIGH at a
    score of 0.8 or more, MEDIUM from 0.5 up to (but excluding) 0.8, and LOW
    below 0.5.
    """
    # The UPC check comes first so the score is never consulted for UPC hits.
    if method == MatchMethod.UPC or score >= 0.8:
        return MatchConfidence.HIGH
    return MatchConfidence.MEDIUM if score >= 0.5 else MatchConfidence.LOW
|
||||
|
||||
|
||||
def _create_product_from_item(
    session: Session,
    item: PurchaseItemCreate,
) -> NormalizedProduct:
    """Add and flush a new NormalizedProduct built from an unmatched item.

    The raw product name becomes the canonical name, the size and unit are
    taken from the name when it contains them, and the item's UPC (if any)
    seeds the product's UPC variants.
    """
    parsed_size = extract_size_info(item.product_name_raw)
    if parsed_size:
        size, size_unit = parsed_size[0], parsed_size[1]
    else:
        size = size_unit = None

    known_upcs = [item.upc] if item.upc else []

    new_product = NormalizedProduct(
        id=uuid.uuid4(),
        canonical_name=item.product_name_raw,
        size=size,
        size_unit=size_unit,
        upc_variants=known_upcs,
    )
    session.add(new_product)
    # Flush so the pending row is sent to the database within this session.
    session.flush()
    return new_product
|
||||
|
||||
|
||||
class ProductMatcher:
    """Matches purchase items to normalized products, one by one or in bulk.

    Usage:
        matcher = ProductMatcher(session)
        outcomes = matcher.match_items(items)
    """

    def __init__(
        self,
        session: Session,
        name_threshold: float = 0.4,
        auto_create: bool = True,
    ):
        # Database session used for lookups and for creating products.
        self.session = session
        # Minimum name-similarity score accepted as a fuzzy match.
        self.name_threshold = name_threshold
        # Whether an unmatched item gets a freshly created product.
        self.auto_create = auto_create

    def match_single(
        self,
        item: PurchaseItemCreate,
    ) -> tuple[NormalizedProduct | None, MatchResult | None, MatchConfidence]:
        """Resolve one purchase item to a normalized product.

        Returns (product, match_result, confidence_level). When nothing
        matches, the match result is None and the confidence is LOW; the
        product is then a newly created one if auto_create is True, otherwise
        None.
        """
        hit = normalize_product(
            self.session,
            item.product_name_raw,
            upc=item.upc,
            name_threshold=self.name_threshold,
        )

        if not hit:
            if not self.auto_create:
                return None, None, MatchConfidence.LOW
            created = _create_product_from_item(self.session, item)
            return created, None, MatchConfidence.LOW

        level = classify_confidence(hit.confidence, hit.method)
        return hit.product, hit, level

    def match_items(self, items: list[PurchaseItemCreate]) -> list[MatchOutcome]:
        """Match every item in the batch and return one outcome per item, in order."""

        def outcome_for(position: int, purchase_item: PurchaseItemCreate) -> MatchOutcome:
            product, hit, level = self.match_single(purchase_item)
            return MatchOutcome(
                item_index=position,
                match=hit,
                confidence_level=level,
                # A product without a match result can only have been created.
                created_new=hit is None and product is not None,
            )

        return [outcome_for(position, entry) for position, entry in enumerate(items)]
|
||||
|
||||
|
||||
def match_purchase_item(
    session: Session,
    item: PurchaseItemCreate,
    name_threshold: float = 0.4,
    auto_create: bool = True,
) -> tuple[NormalizedProduct | None, MatchConfidence]:
    """Match one item with a throwaway ProductMatcher; return (product, confidence)."""
    outcome = ProductMatcher(
        session,
        name_threshold=name_threshold,
        auto_create=auto_create,
    ).match_single(item)
    # match_single yields (product, match_result, confidence); drop the middle.
    return outcome[0], outcome[2]
|
||||
@@ -0,0 +1,155 @@
|
||||
"""Product normalization — Phase 1: UPC matching + fuzzy name matching.
|
||||
|
||||
Matches products across retailers by:
|
||||
1. Exact UPC match (highest confidence)
|
||||
2. Fuzzy name matching via token-based Jaccard similarity (lower confidence)
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from enum import StrEnum
|
||||
|
||||
from cartsnitch_common.models.product import NormalizedProduct
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
|
||||
class MatchMethod(StrEnum):
|
||||
"""How a product match was determined."""
|
||||
|
||||
UPC = "upc"
|
||||
NAME = "name"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MatchResult:
    """Immutable outcome of a successful product normalization lookup."""

    # The normalized product that was matched.
    product: NormalizedProduct
    # Match score: 1.0 for UPC matches, the Jaccard similarity for name matches.
    confidence: float
    # Strategy that produced the match.
    method: MatchMethod
|
||||
|
||||
|
||||
# Filler tokens that carry no product identity and are dropped when names are
# cleaned for comparison.
_NOISE_WORDS = frozenset(
    (
        # articles and conjunctions
        "the", "a", "an", "and", "or",
        # prepositions
        "of", "with", "in", "for", "to",
        # marketing filler
        "brand", "original", "classic", "new", "improved",
    )
)

# Finds a size such as "16 oz", "1.5 lb" or "12 ct". Group 1 is the number and
# group 2 the unit; matching ignores case.
_SIZE_PATTERN = re.compile(
    r"(\d+(?:\.\d+)?)\s*(oz|fl\s*oz|lb|lbs|g|kg|ml|l|ct|pk|count|pack)\b",
    flags=re.IGNORECASE,
)
|
||||
|
||||
|
||||
def clean_name(name: str) -> str:
    """Reduce a product name to its comparable core tokens.

    The name is lowercased, size expressions (e.g. "16 oz") are removed,
    punctuation is replaced by spaces, noise words are dropped, and the
    remaining tokens are joined with single spaces.
    """
    without_size = _SIZE_PATTERN.sub("", name.lower())
    without_punctuation = re.sub(r"[^\w\s]", " ", without_size)
    return " ".join(
        word for word in without_punctuation.split() if word not in _NOISE_WORDS
    )
|
||||
|
||||
|
||||
def extract_size_info(name: str) -> tuple[str, str] | None:
    """Return (size, unit) for the first size expression in a name, or None.

    The unit is lowercased and any spaces in it become underscores
    ("fl oz" -> "fl_oz").
    """
    found = _SIZE_PATTERN.search(name)
    if found is None:
        return None
    amount, unit = found.groups()
    return amount, unit.lower().replace(" ", "_")
|
||||
|
||||
|
||||
def jaccard_similarity(a: str, b: str) -> float:
    """Jaccard similarity of the whitespace-separated token sets of two names.

    Returns 0.0 when either name has no tokens; otherwise the number of shared
    tokens divided by the number of distinct tokens across both names.
    """
    left, right = set(a.split()), set(b.split())
    if not (left and right):
        return 0.0
    return len(left & right) / len(left | right)
|
||||
|
||||
|
||||
def match_by_upc(session: Session, upc: str) -> MatchResult | None:
    """Look up the normalized product whose UPC variants contain ``upc``.

    Every product with a non-null ``upc_variants`` is loaded and membership is
    tested in Python for cross-database compatibility (works on both
    PostgreSQL and SQLite). The first product containing the UPC is returned
    with confidence 1.0; None when no product contains it.
    """
    # TODO: Use PostgreSQL JSON containment query (@>) for production.
    # Current approach loads all products into memory — acceptable for tests
    # and small datasets, but will not scale.
    query = select(NormalizedProduct).where(NormalizedProduct.upc_variants.is_not(None))
    for candidate in session.execute(query).scalars().all():
        variants = candidate.upc_variants
        if variants and upc in variants:
            return MatchResult(product=candidate, confidence=1.0, method=MatchMethod.UPC)
    return None
|
||||
|
||||
|
||||
def match_by_name(
    session: Session,
    name: str,
    threshold: float = 0.5,
) -> MatchResult | None:
    """Pick the normalized product whose name is most similar to ``name``.

    All normalized products are loaded and scored by Jaccard similarity of
    their cleaned names. The highest-scoring product at or above the threshold
    wins (the earliest one on ties); None is returned when nothing qualifies.
    """
    # TODO: Use pg_trgm similarity index for production.
    # Current approach loads all products into memory — acceptable for tests
    # and small datasets, but will not scale.
    wanted = clean_name(name)
    candidates = session.execute(select(NormalizedProduct)).scalars().all()

    winner: NormalizedProduct | None = None
    top_score = 0.0

    for candidate in candidates:
        similarity = jaccard_similarity(wanted, clean_name(candidate.canonical_name))
        # Ignore anything under the threshold or not strictly better than
        # the current leader.
        if similarity < threshold or similarity <= top_score:
            continue
        top_score, winner = similarity, candidate

    if not winner:
        return None
    return MatchResult(product=winner, confidence=top_score, method=MatchMethod.NAME)
|
||||
|
||||
|
||||
def normalize_product(
    session: Session,
    name: str,
    upc: str | None = None,
    name_threshold: float = 0.5,
) -> MatchResult | None:
    """Resolve a product by UPC when one is given, else by fuzzy name.

    The name lookup also runs when a UPC is supplied but matches nothing.
    """
    upc_hit = match_by_upc(session, upc) if upc else None
    if upc_hit:
        return upc_hit
    return match_by_name(session, name, threshold=name_threshold)
|
||||
@@ -0,0 +1,144 @@
|
||||
"""Receipt normalization — parse raw Meijer scraper output into purchase records.
|
||||
|
||||
Maps raw receipt fields, cleans product names, extracts quantities/units.
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import date
|
||||
from decimal import Decimal, InvalidOperation
|
||||
|
||||
from cartsnitch_common.schemas.purchase import PurchaseCreate, PurchaseItemCreate
|
||||
|
||||
|
||||
def _clean_product_name(raw: str) -> str:
    """Tidy a scraped product name.

    Surrounding whitespace and leading/trailing non-word characters are
    removed, then every internal run of whitespace becomes a single space.
    """
    trimmed = re.sub(r"^\W+|\W+$", "", raw.strip())
    return re.sub(r"\s+", " ", trimmed)
|
||||
|
||||
|
||||
def _safe_decimal(
|
||||
value: str | float | int | Decimal | None,
|
||||
default: Decimal = Decimal("0"),
|
||||
) -> Decimal:
|
||||
"""Safely convert a value to Decimal."""
|
||||
if value is None:
|
||||
return default
|
||||
try:
|
||||
return Decimal(str(value))
|
||||
except (InvalidOperation, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
def parse_meijer_item(raw_item: dict) -> PurchaseItemCreate:
    """Parse a single Meijer scraper line item into a PurchaseItemCreate.

    Expected raw_item keys (from Meijer scraper):
    - description / name: product name
    - upc / upcCode: UPC barcode
    - quantity / qty: number of units
    - unitPrice / price: per-unit price
    - extendedPrice / totalPrice: line total
    - regularPrice: shelf price before discounts
    - salePrice: sale price if applicable
    - couponAmount / couponDiscount: coupon savings
    - loyaltyAmount / loyaltyDiscount: loyalty savings
    - category / department: raw category

    The line total is derived as unit price times quantity only when neither
    line-total key carries a value; a line total that is explicitly 0 is kept
    as 0.
    """
    name = raw_item.get("description") or raw_item.get("name") or ""
    cleaned_name = _clean_product_name(name)

    upc = raw_item.get("upc") or raw_item.get("upcCode")
    if upc:
        # Drop leading zeros, but keep the original text for an all-zero UPC.
        upc = str(upc).strip().lstrip("0") or str(upc).strip()

    qty = _safe_decimal(
        raw_item.get("quantity") or raw_item.get("qty"),
        default=Decimal("1"),
    )

    unit_price = _safe_decimal(raw_item.get("unitPrice") or raw_item.get("price"))
    extended = _safe_decimal(raw_item.get("extendedPrice") or raw_item.get("totalPrice"))

    # Distinguish "no line total supplied" from "line total is zero": only
    # the former is filled in from unit price and quantity, so a fully
    # discounted (free) line keeps its 0 total.
    has_line_total = any(
        raw_item.get(key) not in (None, "") for key in ("extendedPrice", "totalPrice")
    )
    if not has_line_total and unit_price > 0:
        extended = unit_price * qty

    regular = raw_item.get("regularPrice")
    sale = raw_item.get("salePrice")
    coupon = raw_item.get("couponAmount") or raw_item.get("couponDiscount")
    loyalty = raw_item.get("loyaltyAmount") or raw_item.get("loyaltyDiscount")
    category = raw_item.get("category") or raw_item.get("department")

    return PurchaseItemCreate(
        product_name_raw=cleaned_name,
        upc=upc,
        quantity=qty,
        unit_price=unit_price,
        extended_price=extended,
        regular_price=_safe_decimal(regular) if regular is not None else None,
        sale_price=_safe_decimal(sale) if sale is not None else None,
        coupon_discount=_safe_decimal(coupon) if coupon is not None else None,
        loyalty_discount=_safe_decimal(loyalty) if loyalty is not None else None,
        category_raw=str(category).strip() if category else None,
    )
|
||||
|
||||
|
||||
def normalize_receipt(
    raw_receipt: dict,
    user_id: str,
    store_id: str,
) -> PurchaseCreate:
    """Parse a complete Meijer raw receipt into a PurchaseCreate.

    Expected raw_receipt keys:
    - receiptId / receipt_id / id: unique receipt identifier
    - date / purchaseDate / purchase_date: purchase date (ISO string whose
      first ten characters are YYYY-MM-DD, or a date / datetime object)
    - total / totalAmount: receipt total
    - subtotal: pre-tax subtotal
    - tax / taxAmount: tax amount
    - savings / totalSavings: total discount savings
    - items: list of raw line item dicts

    Args:
        raw_receipt: Raw receipt dict as described above.
        user_id: Owner of the purchase; a string is converted to a UUID.
        store_id: Store of the purchase; a string is converted to a UUID.

    Returns:
        The populated PurchaseCreate. A random UUID is used as receipt id when
        the receipt has none, and today's date when it has no usable date.

    Raises:
        ValueError: If a date string is not ISO formatted, or a string
            ``user_id`` / ``store_id`` is not a valid UUID.
    """
    import uuid
    from datetime import datetime

    receipt_id = str(
        raw_receipt.get("receiptId")
        or raw_receipt.get("receipt_id")
        or raw_receipt.get("id")
        or uuid.uuid4()
    )

    raw_date = (
        raw_receipt.get("date")
        or raw_receipt.get("purchaseDate")
        or raw_receipt.get("purchase_date")
    )
    if isinstance(raw_date, str):
        purchase_date = date.fromisoformat(raw_date[:10])
    elif isinstance(raw_date, datetime):
        # datetime is a subclass of date, so it must be checked first;
        # otherwise the time component would be carried into purchase_date.
        purchase_date = raw_date.date()
    elif isinstance(raw_date, date):
        purchase_date = raw_date
    else:
        purchase_date = date.today()

    total = _safe_decimal(raw_receipt.get("total") or raw_receipt.get("totalAmount"))
    subtotal = raw_receipt.get("subtotal")
    tax = raw_receipt.get("tax") or raw_receipt.get("taxAmount")
    savings = raw_receipt.get("savings") or raw_receipt.get("totalSavings")

    raw_items = raw_receipt.get("items") or []
    items = [parse_meijer_item(item) for item in raw_items]

    return PurchaseCreate(
        user_id=uuid.UUID(user_id) if isinstance(user_id, str) else user_id,
        store_id=uuid.UUID(store_id) if isinstance(store_id, str) else store_id,
        receipt_id=receipt_id,
        purchase_date=purchase_date,
        total=total,
        subtotal=_safe_decimal(subtotal) if subtotal is not None else None,
        tax=_safe_decimal(tax) if tax is not None else None,
        savings_total=_safe_decimal(savings) if savings is not None else None,
        raw_data=raw_receipt,
        items=items,
    )
|
||||
@@ -0,0 +1 @@
|
||||
"""Retailer scrapers."""
|
||||
@@ -0,0 +1,72 @@
|
||||
"""Abstract base scraper interface for all retailer scrapers."""
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
|
||||
from receiptwitness.config import settings
|
||||
|
||||
|
||||
@dataclass
|
||||
class SessionData:
|
||||
"""Holds session cookies and metadata for a retailer login."""
|
||||
|
||||
cookies: list[dict]
|
||||
user_agent: str
|
||||
created_at: datetime
|
||||
expires_at: datetime | None = None
|
||||
extra: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
|
||||
class RawReceipt:
|
||||
"""Raw receipt data before parsing."""
|
||||
|
||||
receipt_id: str
|
||||
purchase_date: str
|
||||
store_number: str | None = None
|
||||
raw_data: dict = field(default_factory=dict)
|
||||
source_url: str | None = None
|
||||
|
||||
|
||||
class BaseScraper(ABC):
    """All retailer scrapers implement this interface.

    Provides the shared human-like delay helper and declares the abstract
    methods each retailer scraper must implement.
    """

    @abstractmethod
    async def login(self, username: str, password: str) -> SessionData:
        """Authenticate with the retailer portal and return session data."""
        ...

    @abstractmethod
    async def check_session(self, session: SessionData) -> bool:
        """Verify if an existing session is still valid."""
        ...

    @abstractmethod
    async def scrape_receipts(
        self, session: SessionData, since: datetime | None = None
    ) -> list[RawReceipt]:
        """Scrape receipt data from the retailer portal."""
        ...

    @abstractmethod
    def parse_receipt(self, raw: RawReceipt) -> dict:
        """Parse a raw receipt into structured data.

        Returns a dict with keys matching PurchaseCreate schema fields,
        including an 'items' list matching PurchaseItemCreate fields.
        """
        ...

    async def human_delay(self, min_ms: int | None = None, max_ms: int | None = None) -> None:
        """Sleep for a randomized human-like interval.

        Args:
            min_ms: Lower bound in milliseconds; ``None`` uses
                ``settings.min_request_delay_ms``. An explicit 0 is honored.
            max_ms: Upper bound in milliseconds; ``None`` uses
                ``settings.max_request_delay_ms``. An explicit 0 is honored.
        """
        # Compare against None rather than truthiness so that 0 ("no delay")
        # is not silently replaced by the configured default.
        lo = settings.min_request_delay_ms if min_ms is None else min_ms
        hi = settings.max_request_delay_ms if max_ms is None else max_ms
        # random.randint raises ValueError when lo > hi, which happens when
        # only one bound is overridden past the other's default.
        if lo > hi:
            lo, hi = hi, lo
        delay = random.randint(lo, hi) / 1000.0
        await asyncio.sleep(delay)
|
||||
@@ -0,0 +1,344 @@
|
||||
"""Kroger loyalty portal scraper using Playwright.
|
||||
|
||||
Kroger uses Akamai Bot Manager for aggressive headless browser detection.
|
||||
This scraper uses enhanced stealth measures including playwright-stealth,
|
||||
realistic fingerprinting, and human-like interaction pacing.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from typing import cast
|
||||
|
||||
from playwright.async_api import BrowserContext, Page, Playwright, async_playwright
|
||||
|
||||
from receiptwitness.config import settings
|
||||
from receiptwitness.scrapers.base import BaseScraper, RawReceipt, SessionData
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Kroger site root and the pages / API endpoints derived from it
KROGER_BASE = "https://www.kroger.com"
KROGER_LOGIN_PAGE = KROGER_BASE + "/signin"
KROGER_PURCHASE_HISTORY = KROGER_BASE + "/mypurchases"
KROGER_RECEIPT_API = KROGER_BASE + "/atlas/v1/purchase-history/api"
KROGER_RECEIPT_DETAIL_API = KROGER_BASE + "/atlas/v1/receipt/api"
KROGER_ACCOUNT_PAGE = KROGER_BASE + "/account/dashboard"

# Browser fingerprint presented to the site: desktop Chrome on Windows
DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)
DEFAULT_VIEWPORT = dict(width=1920, height=1080)
DEFAULT_LOCALE = "en-US"
DEFAULT_TIMEZONE = "America/New_York"
|
||||
|
||||
|
||||
class KrogerScraper(BaseScraper):
    """Scraper for Kroger loyalty purchase history.

    Kroger uses Akamai Bot Manager which aggressively detects headless
    browsers. This scraper employs enhanced stealth measures:
    - Masks webdriver/automation signals
    - Sets realistic browser fingerprint
    - Uses human-like interaction pacing
    - Replays captured cookies into a fresh browser context for each operation
    """

    async def _create_stealth_context(
        self, playwright_instance: Playwright, cookies: list[dict] | None = None
    ) -> BrowserContext:
        """Create a browser context with enhanced stealth for Akamai evasion.

        Args:
            playwright_instance: Playwright driver used to launch Chromium.
            cookies: Optional cookies to preload into the new context.

        Returns:
            The new context. The browser it belongs to is reachable through
            ``context.browser`` and is closed by the calling method.
        """
        browser = await playwright_instance.chromium.launch(
            headless=settings.headless,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--no-sandbox",
                "--disable-dev-shm-usage",
                "--disable-infobars",
                "--window-size=1920,1080",
            ],
        )
        context = await browser.new_context(
            user_agent=DEFAULT_USER_AGENT,
            viewport=DEFAULT_VIEWPORT,  # type: ignore[arg-type]
            locale=DEFAULT_LOCALE,
            timezone_id=DEFAULT_TIMEZONE,
            java_script_enabled=True,
            bypass_csp=False,
            color_scheme="light",
            has_touch=False,
        )

        # Enhanced stealth script targeting Akamai Bot Manager detection vectors.
        # NOTE: the original permissions.query is bound to its receiver; calling
        # the detached method fails with "Illegal invocation" for every
        # non-notification query.
        await context.add_init_script(
            """
            // Mask webdriver flag
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });

            // Chrome runtime object
            window.chrome = {
                runtime: {},
                loadTimes: function() {},
                csi: function() {},
                app: { isInstalled: false }
            };

            // Realistic plugin array
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5]
            });

            // Languages
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en']
            });

            // Platform
            Object.defineProperty(navigator, 'platform', {
                get: () => 'Win32'
            });

            // Hardware concurrency
            Object.defineProperty(navigator, 'hardwareConcurrency', {
                get: () => 8
            });

            // Device memory
            Object.defineProperty(navigator, 'deviceMemory', {
                get: () => 8
            });

            // Permissions query override (Akamai checks this)
            const originalQuery = window.navigator.permissions.query.bind(
                window.navigator.permissions
            );
            window.navigator.permissions.query = (parameters) =>
                parameters.name === 'notifications'
                    ? Promise.resolve({ state: Notification.permission })
                    : originalQuery(parameters);

            // WebGL vendor/renderer (avoid "Google Inc." / "ANGLE" tells)
            const getParameter = WebGLRenderingContext.prototype.getParameter;
            WebGLRenderingContext.prototype.getParameter = function(parameter) {
                if (parameter === 37445) return 'Intel Inc.';
                if (parameter === 37446) return 'Intel Iris OpenGL Engine';
                return getParameter.call(this, parameter);
            };
            """
        )

        if cookies:
            await context.add_cookies(cookies)  # type: ignore[arg-type]

        return cast(BrowserContext, context)

    async def login(self, username: str, password: str) -> SessionData:
        """Log in to Kroger and capture session cookies.

        Launches a dedicated browser for the login flow and always closes it,
        whether the flow succeeds or raises.
        """
        async with async_playwright() as p:
            context = await self._create_stealth_context(p)
            page = await context.new_page()
            try:
                return await self._perform_login(page, context, username, password)
            finally:
                if context.browser:
                    await context.browser.close()

    async def _perform_login(
        self, page: Page, context: BrowserContext, username: str, password: str
    ) -> SessionData:
        """Execute the Kroger login flow.

        Args:
            page: Page used to drive the sign-in form.
            context: Context owning ``page``; its cookies are captured afterwards.
            username: Account e-mail / username typed into the form.
            password: Account password typed into the form.

        Returns:
            Session data holding the captured cookies, with a two-hour expiry.
            Errors raised by Playwright are not caught here.
        """
        logger.info("Navigating to Kroger sign-in page")
        await page.goto(KROGER_LOGIN_PAGE, wait_until="networkidle")
        await self.human_delay(2000, 4000)

        # The selectors below are alternatives for the same control. Locators
        # are strict, so ``.first`` keeps the flow working if more than one
        # distinct element happens to match the union.
        email_input = page.locator(
            'input[id="SignIn-emailInput"], '
            'input[name="email"], '
            'input[type="email"], '
            'input[data-testid="SignIn-emailInput"]'
        ).first
        await email_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
        await email_input.click()
        await self.human_delay(300, 700)
        await email_input.fill(username)
        await self.human_delay(800, 1500)

        # Password field
        password_input = page.locator(
            'input[id="SignIn-passwordInput"], '
            'input[name="password"], '
            'input[type="password"], '
            'input[data-testid="SignIn-passwordInput"]'
        ).first
        await password_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
        await password_input.click()
        await self.human_delay(300, 700)
        await password_input.fill(password)
        await self.human_delay(1000, 2000)

        # Sign-in button
        sign_in_btn = page.locator(
            'button[id="SignIn-submitButton"], '
            'button[data-testid="SignIn-submitButton"], '
            'button[type="submit"]:has-text("Sign In")'
        ).first
        await sign_in_btn.click()

        # Wait for redirect away from sign-in page
        await page.wait_for_url(
            lambda url: "signin" not in url.lower(),
            timeout=settings.browser_timeout_ms,
        )
        await self.human_delay(1500, 3000)

        # Capture cookies
        raw_cookies = await context.cookies()
        cookies = [dict(c) for c in raw_cookies]
        now = datetime.now(UTC)

        logger.info("Kroger login successful, captured %d cookies", len(cookies))
        return SessionData(
            cookies=cookies,
            user_agent=DEFAULT_USER_AGENT,
            created_at=now,
            expires_at=now + timedelta(hours=2),
            extra={"retailer": "kroger"},
        )

    async def check_session(self, session: SessionData) -> bool:
        """Check if the Kroger session is still valid.

        Returns ``False`` without launching a browser when the stored expiry
        has passed. Otherwise loads the account page with the stored cookies
        and reports whether it loaded successfully without landing on sign-in.
        Errors during the page load are logged and reported as ``False``.
        """
        if session.expires_at and datetime.now(UTC) > session.expires_at:
            logger.info("Kroger session expired based on timestamp")
            return False

        async with async_playwright() as p:
            context = await self._create_stealth_context(p, cookies=session.cookies)
            page = await context.new_page()
            try:
                response = await page.goto(KROGER_ACCOUNT_PAGE, wait_until="networkidle")
                current_url = page.url.lower()
                is_valid = "signin" not in current_url and response is not None and response.ok
                logger.info("Kroger session check: valid=%s (url=%s)", is_valid, page.url)
                return is_valid
            except Exception:
                logger.exception("Kroger session check failed")
                return False
            finally:
                if context.browser:
                    await context.browser.close()

    async def scrape_receipts(
        self, session: SessionData, since: datetime | None = None
    ) -> list[RawReceipt]:
        """Scrape purchase history from Kroger.

        Args:
            session: Previously captured session whose cookies are replayed.
            since: When given, receipts dated before this moment are skipped.

        Returns:
            The raw receipts collected; the browser is closed before returning.
        """
        async with async_playwright() as p:
            context = await self._create_stealth_context(p, cookies=session.cookies)
            page = await context.new_page()
            try:
                return await self._fetch_receipts(page, since)
            finally:
                if context.browser:
                    await context.browser.close()

    async def _fetch_receipts(self, page: Page, since: datetime | None) -> list[RawReceipt]:
        """Fetch receipt list and details from Kroger purchase history.

        Returns an empty list when the listing request fails or the payload
        does not have the expected shape. Individual malformed entries are
        skipped rather than aborting the whole scrape.
        """
        # Navigate to purchase history to establish context
        await page.goto(KROGER_PURCHASE_HISTORY, wait_until="networkidle")
        await self.human_delay(1500, 3000)

        receipts: list[RawReceipt] = []

        # Kroger purchase history API endpoint
        api_response = await page.request.get(KROGER_RECEIPT_API)
        if not api_response.ok:
            logger.warning(
                "Kroger purchase history request failed: %d %s",
                api_response.status,
                api_response.status_text,
            )
            return []

        response = await api_response.json()
        if not isinstance(response, dict):
            logger.warning("Unexpected purchase history response type: %s", type(response))
            return []

        # Handle Kroger's response structure
        orders = response.get("orders", response.get("purchases", []))
        if not isinstance(orders, list):
            logger.warning("No orders found in Kroger purchase history response")
            return []

        logger.info("Found %d orders in Kroger purchase history", len(orders))

        for order in orders:
            # The payload schema is not under our control; a stray non-object
            # entry must not abort the remaining orders with an AttributeError.
            if not isinstance(order, dict):
                logger.warning("Skipping malformed Kroger order entry of type %s", type(order))
                continue

            raw_id = order.get("orderId") or order.get("receiptId") or order.get("id") or ""
            order_id = str(raw_id)
            # ``or`` chain so an explicit null/empty value falls through to the
            # next candidate key instead of being taken as the date.
            purchase_date = (
                order.get("purchaseDate")
                or order.get("transactionDate")
                or order.get("date")
                or ""
            )

            # Filter by date if 'since' is provided
            if since and purchase_date:
                try:
                    txn_dt = datetime.fromisoformat(purchase_date.replace("Z", "+00:00"))
                    if txn_dt < since:
                        continue
                except (ValueError, TypeError, AttributeError):
                    # Unparseable or non-string dates, and naive-vs-aware
                    # comparisons, cannot be filtered; keep the receipt rather
                    # than risk dropping a purchase.
                    logger.debug(
                        "Could not compare Kroger purchase date %r with 'since'", purchase_date
                    )

            if not order_id:
                continue

            await self.human_delay(1000, 2500)

            # Fetch receipt detail
            detail = await self._fetch_receipt_detail(page, order_id)

            raw_store = (
                order.get("storeNumber")
                or order.get("divisionNumber")
                or order.get("storeId")
                or ""
            )
            store_number = str(raw_store)

            receipts.append(
                RawReceipt(
                    receipt_id=order_id,
                    purchase_date=purchase_date,
                    store_number=store_number,
                    raw_data={**order, "detail": detail},
                    source_url=f"{KROGER_RECEIPT_DETAIL_API}?orderId={order_id}",
                )
            )

        logger.info("Scraped %d receipts from Kroger", len(receipts))
        return receipts

    async def _fetch_receipt_detail(self, page: Page, order_id: str) -> dict:
        """Fetch detailed receipt data for a single Kroger order.

        Best-effort: a failed request, a non-object payload or any exception
        yields an empty dict so the caller can still record the order summary.
        """
        try:
            url = f"{KROGER_RECEIPT_DETAIL_API}?orderId={order_id}"
            api_response = await page.request.get(url)
            if not api_response.ok:
                logger.warning(
                    "Kroger receipt detail request failed for %s: %d",
                    order_id,
                    api_response.status,
                )
                return {}
            detail = await api_response.json()
            return detail if isinstance(detail, dict) else {}
        except Exception:
            logger.exception("Failed to fetch Kroger receipt detail for %s", order_id)
            return {}

    def parse_receipt(self, raw: RawReceipt) -> dict:
        """Parse raw Kroger receipt into structured purchase data.

        Delegates to ``receiptwitness.parsers.kroger.parse_kroger_receipt``,
        imported inside the method.
        """
        from receiptwitness.parsers.kroger import parse_kroger_receipt

        return parse_kroger_receipt(raw)
|
||||
@@ -0,0 +1,301 @@
|
||||
"""Meijer mPerks scraper using Playwright.
|
||||
|
||||
Meijer has no public API. We reverse-engineer the XHR endpoints the mPerks
|
||||
web app uses to pull purchase history and receipt data. The flow:
|
||||
|
||||
1. Launch stealth Playwright browser
|
||||
2. Navigate to mPerks login page and authenticate
|
||||
3. Capture session cookies after successful login
|
||||
4. Use those cookies to hit the mPerks receipt API endpoints directly
|
||||
5. Parse receipt JSON into structured PurchaseCreate records
|
||||
|
||||
Key endpoints (reverse-engineered from mPerks SPA):
|
||||
- Login: POST https://www.meijer.com/bin/meijer/account/login
|
||||
- Receipts: GET https://www.meijer.com/bin/meijer/profile/purchasehistory
|
||||
- Receipt detail: GET https://www.meijer.com/bin/meijer/profile/receipt?receiptId=...
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from typing import cast
|
||||
|
||||
from playwright.async_api import BrowserContext, Page, Playwright, async_playwright
|
||||
|
||||
from receiptwitness.config import settings
|
||||
from receiptwitness.scrapers.base import BaseScraper, RawReceipt, SessionData
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Meijer mPerks URLs (every URL hangs off the same origin)
MEIJER_BASE = "https://www.meijer.com"
MEIJER_LOGIN_PAGE = MEIJER_BASE + "/shopping/login.html"
MEIJER_LOGIN_API = MEIJER_BASE + "/bin/meijer/account/login"
MEIJER_PURCHASE_HISTORY = MEIJER_BASE + "/bin/meijer/profile/purchasehistory"
MEIJER_RECEIPT_DETAIL = MEIJER_BASE + "/bin/meijer/profile/receipt"
MEIJER_MPERKS_HOME = MEIJER_BASE + "/mperks.html"

# Browser fingerprint presented to the site: desktop Chrome on Windows
DEFAULT_USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/131.0.0.0 Safari/537.36",
    )
)
DEFAULT_VIEWPORT = dict(width=1920, height=1080)
DEFAULT_LOCALE = "en-US"
DEFAULT_TIMEZONE = "America/Detroit"  # Meijer HQ is in Grand Rapids, MI
|
||||
|
||||
|
||||
class MeijerScraper(BaseScraper):
    """Scraper for Meijer mPerks purchase history.

    Each operation launches its own Chromium instance, optionally replays
    previously captured cookies, and closes the browser when it finishes.
    """

    async def _create_stealth_context(
        self, playwright_instance: Playwright, cookies: list[dict] | None = None
    ) -> BrowserContext:
        """Create a browser context with stealth settings.

        Args:
            playwright_instance: Playwright driver used to launch Chromium.
            cookies: Optional cookies to preload into the new context.

        Returns:
            The new context. The browser it belongs to is reachable through
            ``context.browser`` and is closed by the calling method.
        """
        browser = await playwright_instance.chromium.launch(
            headless=settings.headless,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--no-sandbox",
            ],
        )
        context = await browser.new_context(
            user_agent=DEFAULT_USER_AGENT,
            viewport=DEFAULT_VIEWPORT,  # type: ignore[arg-type]
            locale=DEFAULT_LOCALE,
            timezone_id=DEFAULT_TIMEZONE,
            java_script_enabled=True,
            bypass_csp=False,
        )
        # Mask webdriver flag
        await context.add_init_script(
            """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
            // Mask chrome automation indicators
            window.chrome = { runtime: {} };
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5]
            });
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en']
            });
            """
        )
        if cookies:
            await context.add_cookies(cookies)  # type: ignore[arg-type]
        return cast(BrowserContext, context)

    async def login(self, username: str, password: str) -> SessionData:
        """Log in to Meijer mPerks and capture session cookies.

        The mPerks login flow:
        1. Navigate to login page
        2. Fill email and password fields
        3. Click sign-in button
        4. Wait for redirect away from the login page
        5. Extract session cookies

        The browser is always closed, whether the flow succeeds or raises.
        """
        async with async_playwright() as p:
            context = await self._create_stealth_context(p)
            page = await context.new_page()
            try:
                return await self._perform_login(page, context, username, password)
            finally:
                if context.browser:
                    await context.browser.close()

    async def _perform_login(
        self, page: Page, context: BrowserContext, username: str, password: str
    ) -> SessionData:
        """Execute the login flow on the mPerks portal.

        Args:
            page: Page used to drive the login form.
            context: Context owning ``page``; its cookies are captured afterwards.
            username: Account e-mail typed into the form.
            password: Account password typed into the form.

        Returns:
            Session data holding the captured cookies, with a four-hour expiry.
            Errors raised by Playwright are not caught here.
        """
        logger.info("Navigating to Meijer login page")
        await page.goto(MEIJER_LOGIN_PAGE, wait_until="networkidle")
        await self.human_delay(1500, 3000)

        # The selectors are broad alternatives. Locators are strict, so
        # ``.first`` keeps the flow working if more than one distinct element
        # matches the union.
        email_input = page.locator('input[type="email"], input[name="email"], #email').first
        await email_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
        await email_input.click()
        await self.human_delay(200, 500)
        await email_input.fill(username)
        await self.human_delay(500, 1000)

        # Fill password field
        password_input = page.locator(
            'input[type="password"], input[name="password"], #password'
        ).first
        await password_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
        await password_input.click()
        await self.human_delay(200, 500)
        await password_input.fill(password)
        await self.human_delay(500, 1500)

        # Click sign-in button
        sign_in_btn = page.locator(
            'button[type="submit"], button:has-text("Sign In"), button:has-text("Log In")'
        ).first
        await sign_in_btn.click()

        # Wait for navigation after login
        await page.wait_for_url(
            lambda url: "login" not in url.lower(),
            timeout=settings.browser_timeout_ms,
        )
        await self.human_delay(1000, 2000)

        # Capture cookies
        raw_cookies = await context.cookies()
        cookies = [dict(c) for c in raw_cookies]
        now = datetime.now(UTC)

        logger.info("Meijer login successful, captured %d cookies", len(cookies))
        return SessionData(
            cookies=cookies,
            user_agent=DEFAULT_USER_AGENT,
            created_at=now,
            expires_at=now + timedelta(hours=4),
        )

    async def check_session(self, session: SessionData) -> bool:
        """Check if the mPerks session is still valid.

        Returns ``False`` without launching a browser when the stored expiry
        has passed. Otherwise loads the mPerks home page with the stored
        cookies and reports whether it loaded successfully without landing on
        a login URL. Errors during the page load are logged and reported as
        ``False``.
        """
        if session.expires_at and datetime.now(UTC) > session.expires_at:
            logger.info("Meijer session expired based on timestamp")
            return False

        async with async_playwright() as p:
            context = await self._create_stealth_context(p, cookies=session.cookies)
            page = await context.new_page()
            try:
                response = await page.goto(MEIJER_MPERKS_HOME, wait_until="networkidle")
                current_url = page.url.lower()
                is_valid = "login" not in current_url and response is not None and response.ok
                logger.info("Meijer session check: valid=%s (url=%s)", is_valid, page.url)
                return is_valid
            except Exception:
                logger.exception("Meijer session check failed")
                return False
            finally:
                if context.browser:
                    await context.browser.close()

    async def scrape_receipts(
        self, session: SessionData, since: datetime | None = None
    ) -> list[RawReceipt]:
        """Scrape purchase history from Meijer mPerks.

        Uses the XHR endpoints the mPerks SPA calls to fetch receipt data.
        The purchase history endpoint returns a list of recent transactions,
        and individual receipt details are fetched for line items.

        Args:
            session: Previously captured session whose cookies are replayed.
            since: When given, receipts dated before this moment are skipped.
        """
        async with async_playwright() as p:
            context = await self._create_stealth_context(p, cookies=session.cookies)
            page = await context.new_page()
            try:
                return await self._fetch_receipts(page, since)
            finally:
                if context.browser:
                    await context.browser.close()

    async def _fetch_receipts(self, page: Page, since: datetime | None) -> list[RawReceipt]:
        """Fetch receipt list and detail via mPerks XHR endpoints.

        Uses Playwright's page.request API (APIRequestContext) instead of
        page.evaluate(fetch(...)) for better observability — requests show up
        in Playwright traces and can be intercepted by route handlers.

        Returns an empty list when the listing request fails or the payload
        does not have the expected shape. Individual malformed entries are
        skipped rather than aborting the whole scrape.
        """
        # Navigate to mPerks to establish context (cookies need domain context)
        await page.goto(MEIJER_MPERKS_HOME, wait_until="networkidle")
        await self.human_delay(1000, 2000)

        receipts: list[RawReceipt] = []

        # Fetch purchase history listing via page.request (APIRequestContext)
        api_response = await page.request.get(MEIJER_PURCHASE_HISTORY)
        if not api_response.ok:
            logger.warning(
                "Purchase history request failed: %d %s",
                api_response.status,
                api_response.status_text,
            )
            return []

        response = await api_response.json()

        if not isinstance(response, dict):
            logger.warning("Unexpected purchase history response type: %s", type(response))
            return []

        transactions = response.get("transactions", response.get("purchaseHistory", []))
        if not isinstance(transactions, list):
            logger.warning("No transactions found in purchase history response")
            return []

        logger.info("Found %d transactions in Meijer purchase history", len(transactions))

        for txn in transactions:
            # The payload schema is not under our control; a stray non-object
            # entry must not abort the remaining transactions.
            if not isinstance(txn, dict):
                logger.warning("Skipping malformed Meijer transaction of type %s", type(txn))
                continue

            # ``or`` chains (not nested .get defaults) so an explicit null does
            # not become the truthy string "None" and trigger a bogus fetch.
            receipt_id = str(txn.get("transactionId") or txn.get("receiptId") or "")
            purchase_date = txn.get("transactionDate") or txn.get("purchaseDate") or ""

            # Filter by date if 'since' is provided
            if since and purchase_date:
                try:
                    txn_dt = datetime.fromisoformat(purchase_date.replace("Z", "+00:00"))
                    if txn_dt < since:
                        continue
                except (ValueError, TypeError, AttributeError):
                    # Unparseable or non-string dates, and naive-vs-aware
                    # comparisons, cannot be filtered; keep the receipt rather
                    # than risk dropping a purchase.
                    logger.debug(
                        "Could not compare Meijer purchase date %r with 'since'", purchase_date
                    )

            if not receipt_id:
                continue

            await self.human_delay(800, 2000)

            # Fetch receipt detail
            detail = await self._fetch_receipt_detail(page, receipt_id)

            store_number = str(txn.get("storeNumber") or txn.get("storeId") or "")

            receipts.append(
                RawReceipt(
                    receipt_id=receipt_id,
                    purchase_date=purchase_date,
                    store_number=store_number,
                    raw_data={**txn, "detail": detail},
                    source_url=f"{MEIJER_RECEIPT_DETAIL}?receiptId={receipt_id}",
                )
            )

        logger.info("Scraped %d receipts from Meijer", len(receipts))
        return receipts

    async def _fetch_receipt_detail(self, page: Page, receipt_id: str) -> dict:
        """Fetch detailed receipt data for a single transaction.

        Uses Playwright's page.request API for traceability. Best-effort: a
        failed request, a non-object payload or any exception yields an empty
        dict so the caller can still record the transaction summary.
        """
        try:
            url = f"{MEIJER_RECEIPT_DETAIL}?receiptId={receipt_id}"
            api_response = await page.request.get(url)
            if not api_response.ok:
                logger.warning(
                    "Receipt detail request failed for %s: %d",
                    receipt_id,
                    api_response.status,
                )
                return {}
            detail = await api_response.json()
            return detail if isinstance(detail, dict) else {}
        except Exception:
            logger.exception("Failed to fetch receipt detail for %s", receipt_id)
            return {}

    def parse_receipt(self, raw: RawReceipt) -> dict:
        """Parse raw Meijer receipt into structured purchase data.

        Delegates to ``receiptwitness.parsers.meijer.parse_meijer_receipt``,
        imported inside the method.
        """
        from receiptwitness.parsers.meijer import parse_meijer_receipt

        return parse_meijer_receipt(raw)
|
||||
@@ -0,0 +1,326 @@
|
||||
"""Target Circle scraper using Playwright.
|
||||
|
||||
Target stores ~1 year of in-store purchase history tied to Circle accounts.
|
||||
Purchases appear when the user pays with a linked card, uses the Target app
|
||||
wallet, or enters their Circle phone number at checkout.
|
||||
|
||||
Key endpoints (reverse-engineered from target.com SPA):
|
||||
- Login: POST https://gsp.target.com/gsp/authentications/v1/auth_codes
|
||||
- Order history: GET https://api.target.com/order_history/v1/orders (in-store tab)
|
||||
- Receipt detail: GET https://api.target.com/order_history/v1/orders/{orderId}
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from typing import cast
|
||||
|
||||
from playwright.async_api import BrowserContext, Page, Playwright, async_playwright
|
||||
|
||||
from receiptwitness.config import settings
|
||||
from receiptwitness.scrapers.base import BaseScraper, RawReceipt, SessionData
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Target endpoints: site pages live on www.target.com, the order API on api.target.com
TARGET_BASE = "https://www.target.com"
TARGET_LOGIN_PAGE = TARGET_BASE + "/login"
TARGET_ACCOUNT_PAGE = TARGET_BASE + "/account"
TARGET_ORDER_HISTORY = TARGET_BASE + "/account/orders"
TARGET_ORDER_API = "https://api.target.com/order_history/v1/orders"
# Receipt detail is addressed under the same collection URL as the order list.
TARGET_RECEIPT_API = "https://api.target.com/order_history/v1/orders"

# Browser fingerprint presented to the site: desktop Chrome on Windows
DEFAULT_USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/131.0.0.0 Safari/537.36",
    )
)
DEFAULT_VIEWPORT = dict(width=1920, height=1080)
DEFAULT_LOCALE = "en-US"
DEFAULT_TIMEZONE = "America/Detroit"  # SE Michigan coverage
|
||||
|
||||
|
||||
class TargetScraper(BaseScraper):
|
||||
"""Scraper for Target Circle in-store purchase history.
|
||||
|
||||
Target's order history SPA loads purchase data from internal API
|
||||
endpoints. This scraper authenticates via the web login flow,
|
||||
captures session cookies, and uses those to hit the order history
|
||||
API for in-store receipt data.
|
||||
"""
|
||||
|
||||
async def _create_stealth_context(
    self, playwright_instance: Playwright, cookies: list[dict] | None = None
) -> BrowserContext:
    """Build a Chromium context configured to look like a regular desktop browser.

    Args:
        playwright_instance: Playwright driver used to launch Chromium.
        cookies: Optional cookies to preload into the new context.

    Returns:
        The new context. The browser it belongs to is reachable through
        ``context.browser`` and is closed by the calling method.
    """
    launch_args = [
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ]
    # Script injected into every page to mask webdriver and automation signals.
    stealth_script = """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });

        window.chrome = {
            runtime: {},
            loadTimes: function() {},
            csi: function() {},
            app: { isInstalled: false }
        };

        Object.defineProperty(navigator, 'plugins', {
            get: () => [1, 2, 3, 4, 5]
        });

        Object.defineProperty(navigator, 'languages', {
            get: () => ['en-US', 'en']
        });

        Object.defineProperty(navigator, 'platform', {
            get: () => 'Win32'
        });

        Object.defineProperty(navigator, 'hardwareConcurrency', {
            get: () => 8
        });

        Object.defineProperty(navigator, 'deviceMemory', {
            get: () => 8
        });
        """

    chromium = playwright_instance.chromium
    browser = await chromium.launch(headless=settings.headless, args=launch_args)
    context = await browser.new_context(
        user_agent=DEFAULT_USER_AGENT,
        viewport=DEFAULT_VIEWPORT,  # type: ignore[arg-type]
        locale=DEFAULT_LOCALE,
        timezone_id=DEFAULT_TIMEZONE,
        java_script_enabled=True,
        bypass_csp=False,
        color_scheme="light",
        has_touch=False,
    )
    await context.add_init_script(stealth_script)

    # Replay a previously captured session only when one was supplied.
    if cookies:
        await context.add_cookies(cookies)  # type: ignore[arg-type]
    return cast(BrowserContext, context)
|
||||
|
||||
async def login(self, username: str, password: str) -> SessionData:
    """Sign in to Target in a dedicated browser and return the captured session."""
    async with async_playwright() as pw:
        ctx = await self._create_stealth_context(pw)
        tab = await ctx.new_page()
        try:
            session = await self._perform_login(tab, ctx, username, password)
        finally:
            # Tear the browser down whether the login flow succeeded or raised.
            if ctx.browser:
                await ctx.browser.close()
        return session
|
||||
|
||||
async def _perform_login(
    self, page: Page, context: BrowserContext, username: str, password: str
) -> SessionData:
    """Execute the Target login flow.

    Args:
        page: Page used to drive the sign-in form.
        context: Context owning ``page``; its cookies are captured afterwards.
        username: Account e-mail / username typed into the form.
        password: Account password typed into the form.

    Returns:
        Session data holding the captured cookies, with a two-hour expiry.
        Errors raised by Playwright are not caught here.
    """
    logger.info("Navigating to Target sign-in page")
    await page.goto(TARGET_LOGIN_PAGE, wait_until="networkidle")
    await self.human_delay(2000, 4000)

    # The selectors below are alternatives for the same control. Locators are
    # strict, so ``.first`` keeps the flow working if more than one distinct
    # element happens to match the union.
    email_input = page.locator(
        'input[id="username"], '
        'input[name="username"], '
        'input[type="email"], '
        'input[data-test="username"]'
    ).first
    await email_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
    await email_input.click()
    await self.human_delay(300, 700)
    await email_input.fill(username)
    await self.human_delay(800, 1500)

    # Password field
    password_input = page.locator(
        'input[id="password"], '
        'input[name="password"], '
        'input[type="password"], '
        'input[data-test="password"]'
    ).first
    await password_input.wait_for(state="visible", timeout=settings.browser_timeout_ms)
    await password_input.click()
    await self.human_delay(300, 700)
    await password_input.fill(password)
    await self.human_delay(1000, 2000)

    # Sign-in button
    sign_in_btn = page.locator(
        'button[id="login"], '
        'button[data-test="login-button"], '
        'button[type="submit"]:has-text("Sign in")'
    ).first
    await sign_in_btn.click()

    # Wait for redirect away from login page
    await page.wait_for_url(
        lambda url: "login" not in url.lower(),
        timeout=settings.browser_timeout_ms,
    )
    await self.human_delay(1500, 3000)

    # Capture cookies
    raw_cookies = await context.cookies()
    cookies = [dict(c) for c in raw_cookies]
    now = datetime.now(UTC)

    logger.info("Target login successful, captured %d cookies", len(cookies))
    return SessionData(
        cookies=cookies,
        user_agent=DEFAULT_USER_AGENT,
        created_at=now,
        expires_at=now + timedelta(hours=2),
        extra={"retailer": "target"},
    )
|
||||
|
||||
async def check_session(self, session: SessionData) -> bool:
    """Report whether the stored Target session can still reach the account page.

    Returns ``False`` without launching a browser when the stored expiry has
    passed. Otherwise loads the account page with the stored cookies; the
    session is valid when the page loads successfully and the final URL is not
    a login URL. Errors during the page load are logged and reported as
    ``False``.
    """
    expiry = session.expires_at
    if expiry and datetime.now(UTC) > expiry:
        logger.info("Target session expired based on timestamp")
        return False

    async with async_playwright() as pw:
        ctx = await self._create_stealth_context(pw, cookies=session.cookies)
        tab = await ctx.new_page()
        try:
            response = await tab.goto(TARGET_ACCOUNT_PAGE, wait_until="networkidle")
            bounced_to_login = "login" in tab.url.lower()
            is_valid = not bounced_to_login and response is not None and response.ok
            logger.info("Target session check: valid=%s (url=%s)", is_valid, tab.url)
            return is_valid
        except Exception:
            logger.exception("Target session check failed")
            return False
        finally:
            # Close the browser on every path, including the error path.
            if ctx.browser:
                await ctx.browser.close()
|
||||
|
||||
async def scrape_receipts(
    self, session: SessionData, since: datetime | None = None
) -> list[RawReceipt]:
    """Collect in-store Target Circle receipts using a stored session.

    Args:
        session: Previously captured session whose cookies are replayed.
        since: Forwarded to ``_fetch_receipts`` as the date cut-off.
    """
    async with async_playwright() as pw:
        ctx = await self._create_stealth_context(pw, cookies=session.cookies)
        tab = await ctx.new_page()
        try:
            receipts = await self._fetch_receipts(tab, since)
        finally:
            # Tear the browser down whether the scrape succeeded or raised.
            if ctx.browser:
                await ctx.browser.close()
        return receipts
|
||||
|
||||
async def _fetch_receipts(self, page: Page, since: datetime | None) -> list[RawReceipt]:
    """Fetch receipt list and details from Target order history.

    Target's order history page has separate tabs for online and in-store
    purchases. We target the in-store tab which shows Circle-linked
    transactions.

    Args:
        page: Browser page used both for navigation and for the API requests.
        since: Optional cutoff; orders whose purchase date parses to an
            earlier datetime are skipped. Orders with a missing or
            unparseable date are kept.

    Returns:
        One ``RawReceipt`` per order that has an identifier, combining the
        order summary with its detail payload under the ``"detail"`` key.
        An empty list is returned when the order-history request fails or
        the response does not have the expected shape.
    """
    # Navigate to order history to establish context
    await page.goto(TARGET_ORDER_HISTORY, wait_until="networkidle")
    await self.human_delay(1500, 3000)

    # Target order history API — filter for in-store purchases
    api_response = await page.request.get(
        TARGET_ORDER_API,
        params={"channel": "in_store", "limit": "50"},
    )
    if not api_response.ok:
        logger.warning(
            "Target order history request failed: %d %s",
            api_response.status,
            api_response.status_text,
        )
        return []

    response = await api_response.json()
    if not isinstance(response, dict):
        logger.warning("Unexpected order history response type: %s", type(response))
        return []

    # Target uses "orders" key for in-store purchase list
    orders = response.get("orders", response.get("transactions", []))
    if not isinstance(orders, list):
        logger.warning("No orders found in Target order history response")
        return []

    logger.info("Found %d in-store orders in Target history", len(orders))

    receipts: list[RawReceipt] = []
    for order in orders:
        # Guard against malformed entries: everything below relies on dict access.
        if not isinstance(order, dict):
            logger.warning("Skipping malformed Target order entry of type %s", type(order))
            continue

        raw_id = order.get("orderId") or order.get("transactionId") or order.get("id") or ""
        order_id = str(raw_id)
        # Use the same ``or`` fallback as the id/store lookups so that a null
        # "purchaseDate" does not mask a populated alternative key.
        purchase_date = (
            order.get("purchaseDate") or order.get("transactionDate") or order.get("date") or ""
        )

        # Filter by date if 'since' is provided
        if since and purchase_date:
            try:
                txn_dt = datetime.fromisoformat(purchase_date.replace("Z", "+00:00"))
                if txn_dt < since:
                    continue
            except (ValueError, TypeError, AttributeError):
                # Unparseable, non-string, or naive/aware-mismatched dates
                # cannot be compared; keep the order rather than drop it.
                pass

        if not order_id:
            continue

        await self.human_delay(1000, 2500)

        # Fetch receipt detail
        detail = await self._fetch_receipt_detail(page, order_id)

        raw_store = (
            order.get("storeNumber") or order.get("storeId") or order.get("locationId") or ""
        )
        store_number = str(raw_store)

        receipts.append(
            RawReceipt(
                receipt_id=order_id,
                purchase_date=purchase_date,
                store_number=store_number,
                raw_data={**order, "detail": detail},
                source_url=f"{TARGET_RECEIPT_API}/{order_id}",
            )
        )

    logger.info("Scraped %d receipts from Target", len(receipts))
    return receipts
|
||||
|
||||
async def _fetch_receipt_detail(self, page: Page, order_id: str) -> dict:
    """Retrieve the detail payload for one Target order.

    Args:
        page: Browser page whose request context issues the API call.
        order_id: Identifier appended to the receipt API URL.

    Returns:
        The decoded JSON object, or an empty dict when the request is not
        OK, the payload is not a JSON object, or any exception occurs.
    """
    try:
        reply = await page.request.get(f"{TARGET_RECEIPT_API}/{order_id}")
        if reply.ok:
            payload = await reply.json()
            if isinstance(payload, dict):
                return payload
            # Anything other than a JSON object is treated as "no detail".
            return {}
        logger.warning(
            "Target receipt detail request failed for %s: %d",
            order_id,
            reply.status,
        )
        return {}
    except Exception:
        # Detail is optional enrichment; log and carry on with an empty payload.
        logger.exception("Failed to fetch Target receipt detail for %s", order_id)
        return {}
|
||||
|
||||
def parse_receipt(self, raw: RawReceipt) -> dict:
    """Convert a raw Target receipt into structured purchase data.

    Delegates to ``receiptwitness.parsers.target.parse_target_receipt``.
    """
    # The parser is imported at call time rather than at module import time.
    from receiptwitness.parsers.target import parse_target_receipt as _parse

    return _parse(raw)
|
||||
@@ -0,0 +1 @@
|
||||
"""Session management — encrypted cookie storage and refresh logic."""
|
||||
@@ -0,0 +1,52 @@
|
||||
"""Fernet-based encryption for session cookies at rest.
|
||||
|
||||
Session data (cookies, tokens) is encrypted before writing to the database
|
||||
and decrypted only when needed for a scrape. The encryption key is provided
|
||||
via the RW_SESSION_ENCRYPTION_KEY environment variable — it is never stored
|
||||
in the database or logged.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
from cryptography.fernet import Fernet, InvalidToken
|
||||
|
||||
from receiptwitness.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _get_fernet() -> Fernet:
    """Build a Fernet cipher from ``settings.session_encryption_key``.

    Raises:
        ValueError: If no encryption key is configured.
    """
    configured_key = settings.session_encryption_key
    if configured_key:
        # Fernet is handed bytes; a str key is encoded first.
        key_bytes = configured_key.encode() if isinstance(configured_key, str) else configured_key
        return Fernet(key_bytes)

    raise ValueError(
        "RW_SESSION_ENCRYPTION_KEY is not set. "
        "Generate one with: "
        "python -c 'from cryptography.fernet import Fernet; "
        "print(Fernet.generate_key().decode())'"
    )
|
||||
|
||||
|
||||
def encrypt_session_data(data: dict) -> str:
    """Serialize a session dict to JSON and encrypt it with Fernet.

    Values that JSON cannot encode natively are converted with ``str``
    (``default=str``). The returned token is decoded to a UTF-8 string.
    """
    cipher = _get_fernet()
    serialized = json.dumps(data, default=str)
    token = cipher.encrypt(serialized.encode("utf-8"))
    return token.decode("utf-8")
|
||||
|
||||
|
||||
def decrypt_session_data(encrypted: str) -> dict:
    """Decrypt a Fernet token string and decode its JSON payload.

    Raises:
        InvalidToken: Re-raised after logging when the token cannot be
            decrypted with the configured key.
    """
    cipher = _get_fernet()
    try:
        plaintext = cipher.decrypt(encrypted.encode("utf-8"))
    except InvalidToken:
        logger.error("Failed to decrypt session data — invalid token or wrong key")
        raise

    payload: dict = json.loads(plaintext)
    return payload
|
||||
@@ -0,0 +1,81 @@
|
||||
"""Session storage, retrieval, and refresh logic.
|
||||
|
||||
Manages the lifecycle of retailer session data:
|
||||
- Load encrypted session from DB
|
||||
- Check validity via scraper
|
||||
- Re-authenticate if expired
|
||||
- Save new session back (encrypted)
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import asdict
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from receiptwitness.scrapers.base import BaseScraper, SessionData
|
||||
from receiptwitness.session.encryption import decrypt_session_data, encrypt_session_data
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def session_from_db_record(session_data_encrypted: str | None) -> SessionData | None:
    """Decrypt a stored session value and rebuild the ``SessionData`` object.

    The session_data column in user_store_accounts stores the Fernet-encrypted
    JSON of the SessionData fields.

    Returns:
        The reconstructed session, or None when the input is empty or when
        decryption or reconstruction fails for any reason (the failure is
        logged).
    """
    if not session_data_encrypted:
        return None

    try:
        payload = decrypt_session_data(session_data_encrypted)
        raw_expiry = payload.get("expires_at")
        expiry = datetime.fromisoformat(raw_expiry) if raw_expiry else None
        return SessionData(
            cookies=payload["cookies"],
            user_agent=payload["user_agent"],
            created_at=datetime.fromisoformat(payload["created_at"]),
            expires_at=expiry,
            extra=payload.get("extra", {}),
        )
    except Exception:
        # A corrupt or undecryptable record is treated as "no session".
        logger.exception("Failed to load session from DB record")
        return None
|
||||
|
||||
|
||||
def session_to_db_value(session: SessionData) -> str:
    """Turn a session into the encrypted string stored in the database."""
    payload = asdict(session)

    # Datetimes are written as ISO-8601 strings before JSON serialization.
    payload["created_at"] = session.created_at.isoformat()
    expiry = session.expires_at
    if expiry:
        payload["expires_at"] = expiry.isoformat()

    return encrypt_session_data(payload)
|
||||
|
||||
|
||||
async def get_valid_session(
    scraper: BaseScraper,
    session_data_encrypted: str | None,
    username: str,
    password: str,
) -> tuple[SessionData, bool]:
    """Return a usable session, logging in again when the stored one is unusable.

    Args:
        scraper: Scraper used to validate the stored session and to log in.
        session_data_encrypted: Encrypted session value from the database, if any.
        username: Credential passed to ``scraper.login``.
        password: Credential passed to ``scraper.login``.

    Returns:
        A tuple of (session, was_refreshed). If was_refreshed is True,
        the caller should persist the new session to the database.
    """
    stored = session_from_db_record(session_data_encrypted)
    if stored:
        if stored.expires_at and datetime.now(UTC) > stored.expires_at:
            # Already past its expiry timestamp; skip the scraper check.
            logger.info("Session expired by timestamp, re-authenticating")
        else:
            if await scraper.check_session(stored):
                logger.info("Existing session is valid")
                return stored, False
            logger.info("Session check failed, re-authenticating")

    # No stored session, or it was rejected above: log in from scratch.
    logger.info("Performing fresh login")
    fresh = await scraper.login(username, password)
    return fresh, True
|
||||
@@ -0,0 +1,29 @@
|
||||
"""Shared test fixtures."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
FIXTURES_DIR = Path(__file__).parent / "fixtures"
|
||||
|
||||
|
||||
@pytest.fixture
def meijer_receipt_data() -> dict:
    """Load the sample Meijer receipt fixture.

    The file is read as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.
    """
    fixture_path = FIXTURES_DIR / "meijer_receipt.json"
    return json.loads(fixture_path.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
@pytest.fixture
def kroger_receipt_data() -> dict:
    """Load the sample Kroger receipt fixture.

    The file is read as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.
    """
    fixture_path = FIXTURES_DIR / "kroger_receipt.json"
    return json.loads(fixture_path.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
@pytest.fixture
def target_receipt_data() -> dict:
    """Load the sample Target receipt fixture.

    The file is read as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.
    """
    fixture_path = FIXTURES_DIR / "target_receipt.json"
    return json.loads(fixture_path.read_text(encoding="utf-8"))
|
||||
+131
@@ -0,0 +1,131 @@
|
||||
{
|
||||
"orderId": "KR-2026-0312-4471",
|
||||
"purchaseDate": "2026-03-12T16:45:00Z",
|
||||
"storeNumber": "00357",
|
||||
"divisionNumber": "014",
|
||||
"total": 94.17,
|
||||
"savings": 15.30,
|
||||
"detail": {
|
||||
"receiptId": "KR-2026-0312-4471",
|
||||
"items": [
|
||||
{
|
||||
"description": "KROGER WHOLE MILK GAL",
|
||||
"upc": "0001111041700",
|
||||
"quantity": 1,
|
||||
"basePrice": 3.99,
|
||||
"totalPrice": 3.99,
|
||||
"regularPrice": 4.29,
|
||||
"salePrice": 3.99,
|
||||
"couponAmount": 0.0,
|
||||
"plusCardSavings": 0.30,
|
||||
"department": "DAIRY"
|
||||
},
|
||||
{
|
||||
"description": "BANANAS",
|
||||
"upc": "0000000004011",
|
||||
"quantity": 1,
|
||||
"basePrice": 0.59,
|
||||
"totalPrice": 0.59,
|
||||
"regularPrice": 0.59,
|
||||
"salePrice": null,
|
||||
"couponAmount": null,
|
||||
"plusCardSavings": null,
|
||||
"department": "PRODUCE"
|
||||
},
|
||||
{
|
||||
"description": "SIMPLE TRUTH ORG EGGS 12CT",
|
||||
"upc": "0001111087840",
|
||||
"quantity": 2,
|
||||
"basePrice": 5.49,
|
||||
"totalPrice": 10.98,
|
||||
"regularPrice": 5.99,
|
||||
"salePrice": 5.49,
|
||||
"couponAmount": 0.0,
|
||||
"plusCardSavings": 1.00,
|
||||
"department": "DAIRY"
|
||||
},
|
||||
{
|
||||
"description": "KROGER DELI TURKEY BREAST",
|
||||
"upc": null,
|
||||
"quantity": 0.68,
|
||||
"basePrice": 9.99,
|
||||
"totalPrice": 6.79,
|
||||
"regularPrice": 9.99,
|
||||
"salePrice": null,
|
||||
"weight": 0.68,
|
||||
"weightUom": "LB",
|
||||
"department": "DELI"
|
||||
},
|
||||
{
|
||||
"description": "TIDE PODS 42CT",
|
||||
"upc": "0003700096223",
|
||||
"quantity": 1,
|
||||
"basePrice": 13.99,
|
||||
"totalPrice": 13.99,
|
||||
"regularPrice": 15.99,
|
||||
"salePrice": 13.99,
|
||||
"couponAmount": 2.00,
|
||||
"plusCardSavings": 0.0,
|
||||
"department": "HOUSEHOLD"
|
||||
},
|
||||
{
|
||||
"description": "VOIDED DORITOS NACHO",
|
||||
"upc": "0002840032505",
|
||||
"quantity": 1,
|
||||
"basePrice": 4.79,
|
||||
"totalPrice": 4.79,
|
||||
"voided": true,
|
||||
"department": "SNACKS"
|
||||
},
|
||||
{
|
||||
"description": "RETURNED GATORADE 8PK",
|
||||
"upc": "0005200012505",
|
||||
"quantity": 1,
|
||||
"basePrice": 7.99,
|
||||
"totalPrice": 7.99,
|
||||
"status": "RETURNED",
|
||||
"department": "BEVERAGES"
|
||||
},
|
||||
{
|
||||
"description": "KROGER SHARP CHEDDAR 8OZ",
|
||||
"upc": "0001111060930",
|
||||
"quantity": 1,
|
||||
"basePrice": 3.49,
|
||||
"totalPrice": 3.49,
|
||||
"regularPrice": 3.49,
|
||||
"salePrice": null,
|
||||
"couponAmount": null,
|
||||
"plusCardSavings": null,
|
||||
"department": "DAIRY"
|
||||
},
|
||||
{
|
||||
"description": "PRIVATE SELECTION PASTA",
|
||||
"upc": "0001111085612",
|
||||
"quantity": 3,
|
||||
"basePrice": 2.49,
|
||||
"totalPrice": 7.47,
|
||||
"regularPrice": 2.99,
|
||||
"salePrice": 2.49,
|
||||
"couponAmount": 0.0,
|
||||
"plusCardSavings": 1.50,
|
||||
"department": "GROCERY"
|
||||
},
|
||||
{
|
||||
"description": "KROGER GROUND BEEF 80/20",
|
||||
"upc": null,
|
||||
"quantity": 1.23,
|
||||
"basePrice": 5.99,
|
||||
"totalPrice": 7.37,
|
||||
"regularPrice": 6.99,
|
||||
"salePrice": 5.99,
|
||||
"weight": 1.23,
|
||||
"weightUom": "LB",
|
||||
"department": "MEAT"
|
||||
}
|
||||
],
|
||||
"subtotal": 78.47,
|
||||
"tax": 5.50,
|
||||
"total": 94.17,
|
||||
"totalSavings": 15.30
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,85 @@
|
||||
{
|
||||
"transactionId": "TXN-2026-0310-001",
|
||||
"transactionDate": "2026-03-10T14:30:00Z",
|
||||
"storeNumber": "42",
|
||||
"total": 87.42,
|
||||
"savings": 12.50,
|
||||
"detail": {
|
||||
"receiptId": "TXN-2026-0310-001",
|
||||
"items": [
|
||||
{
|
||||
"description": "ORGANIC BANANAS",
|
||||
"upc": "0000000004011",
|
||||
"quantity": 1,
|
||||
"price": 0.69,
|
||||
"extendedPrice": 0.69,
|
||||
"regularPrice": 0.79,
|
||||
"salePrice": 0.69,
|
||||
"couponDiscount": 0.0,
|
||||
"mperksDiscount": 0.10,
|
||||
"category": "PRODUCE"
|
||||
},
|
||||
{
|
||||
"description": "MEIJER 2% MILK GAL",
|
||||
"upc": "0041250000123",
|
||||
"quantity": 2,
|
||||
"price": 3.49,
|
||||
"extendedPrice": 6.98,
|
||||
"regularPrice": 3.79,
|
||||
"salePrice": 3.49,
|
||||
"couponDiscount": 0.0,
|
||||
"mperksDiscount": 0.0,
|
||||
"category": "DAIRY"
|
||||
},
|
||||
{
|
||||
"description": "CHEERIOS 18OZ",
|
||||
"upc": "0016000275614",
|
||||
"quantity": 1,
|
||||
"price": 4.99,
|
||||
"extendedPrice": 4.99,
|
||||
"regularPrice": 5.49,
|
||||
"salePrice": null,
|
||||
"couponDiscount": 0.50,
|
||||
"mperksDiscount": 0.0,
|
||||
"category": "CEREAL"
|
||||
},
|
||||
{
|
||||
"description": "WEIGHTED DELI TURKEY",
|
||||
"upc": null,
|
||||
"quantity": 0.75,
|
||||
"price": 8.99,
|
||||
"extendedPrice": 6.74,
|
||||
"regularPrice": 8.99,
|
||||
"salePrice": null,
|
||||
"couponDiscount": null,
|
||||
"mperksDiscount": null,
|
||||
"category": "DELI"
|
||||
},
|
||||
{
|
||||
"description": "VOIDED SODA 12PK",
|
||||
"upc": "0004900005678",
|
||||
"quantity": 1,
|
||||
"price": 5.99,
|
||||
"extendedPrice": 5.99,
|
||||
"voided": true,
|
||||
"category": "BEVERAGES"
|
||||
},
|
||||
{
|
||||
"description": "MEIJER PAPER TOWELS 6PK",
|
||||
"upc": "0041250099001",
|
||||
"quantity": 1,
|
||||
"price": 7.99,
|
||||
"extendedPrice": 7.99,
|
||||
"regularPrice": 9.99,
|
||||
"salePrice": 7.99,
|
||||
"couponDiscount": 1.00,
|
||||
"mperksDiscount": 1.00,
|
||||
"category": "HOUSEHOLD"
|
||||
}
|
||||
],
|
||||
"subtotal": 74.92,
|
||||
"tax": 5.24,
|
||||
"total": 87.42,
|
||||
"totalSavings": 12.50
|
||||
}
|
||||
}
|
||||
+140
@@ -0,0 +1,140 @@
|
||||
{
|
||||
"orderId": "TGT-2026-0315-7890",
|
||||
"purchaseDate": "2026-03-15T11:23:00Z",
|
||||
"storeNumber": "2774",
|
||||
"total": 83.21,
|
||||
"savings": 11.45,
|
||||
"detail": {
|
||||
"receiptId": "TGT-2026-0315-7890",
|
||||
"items": [
|
||||
{
|
||||
"description": "GOOD & GATHER WHOLE MILK GAL",
|
||||
"tcin": "14767459",
|
||||
"upc": "0085239100123",
|
||||
"quantity": 1,
|
||||
"unitPrice": 3.89,
|
||||
"totalPrice": 3.89,
|
||||
"regularPrice": 4.19,
|
||||
"circlePrice": 3.89,
|
||||
"couponDiscount": 0.0,
|
||||
"circleRewardsDiscount": 0.30,
|
||||
"promoDescription": "Circle offer: Save 30c",
|
||||
"department": "GROCERY"
|
||||
},
|
||||
{
|
||||
"description": "BANANAS",
|
||||
"upc": "0000000004011",
|
||||
"quantity": 1,
|
||||
"unitPrice": 0.25,
|
||||
"totalPrice": 0.25,
|
||||
"regularPrice": 0.25,
|
||||
"circlePrice": null,
|
||||
"couponDiscount": null,
|
||||
"circleRewardsDiscount": null,
|
||||
"department": "PRODUCE"
|
||||
},
|
||||
{
|
||||
"description": "MARKET PANTRY LARGE EGGS 18CT",
|
||||
"tcin": "13292174",
|
||||
"upc": "0085239206753",
|
||||
"quantity": 2,
|
||||
"unitPrice": 4.99,
|
||||
"totalPrice": 9.98,
|
||||
"regularPrice": 5.49,
|
||||
"circlePrice": 4.99,
|
||||
"couponDiscount": 0.0,
|
||||
"circleRewardsDiscount": 1.00,
|
||||
"promoDescription": "Circle offer: 2 for $10",
|
||||
"department": "GROCERY"
|
||||
},
|
||||
{
|
||||
"description": "DELI SLICED TURKEY BREAST",
|
||||
"upc": null,
|
||||
"quantity": 0.72,
|
||||
"unitPrice": 10.99,
|
||||
"totalPrice": 7.91,
|
||||
"regularPrice": 10.99,
|
||||
"weight": 0.72,
|
||||
"weightUom": "LB",
|
||||
"department": "DELI"
|
||||
},
|
||||
{
|
||||
"description": "TIDE PODS 42CT",
|
||||
"tcin": "76150253",
|
||||
"upc": "0003700096223",
|
||||
"quantity": 1,
|
||||
"unitPrice": 13.49,
|
||||
"totalPrice": 13.49,
|
||||
"regularPrice": 15.99,
|
||||
"circlePrice": 13.49,
|
||||
"couponDiscount": 2.50,
|
||||
"circleRewardsDiscount": 0.0,
|
||||
"promoDescription": "Circle offer + mfr coupon",
|
||||
"department": "HOUSEHOLD"
|
||||
},
|
||||
{
|
||||
"description": "UP&UP PAPER TOWELS 6PK",
|
||||
"tcin": "52493117",
|
||||
"upc": "0085239401567",
|
||||
"quantity": 1,
|
||||
"unitPrice": 8.99,
|
||||
"totalPrice": 8.99,
|
||||
"regularPrice": 8.99,
|
||||
"circlePrice": null,
|
||||
"couponDiscount": null,
|
||||
"circleRewardsDiscount": null,
|
||||
"department": "HOUSEHOLD"
|
||||
},
|
||||
{
|
||||
"description": "VOIDED COCA-COLA 12PK",
|
||||
"upc": "0004900002521",
|
||||
"quantity": 1,
|
||||
"unitPrice": 7.49,
|
||||
"totalPrice": 7.49,
|
||||
"voided": true,
|
||||
"department": "BEVERAGES"
|
||||
},
|
||||
{
|
||||
"description": "RETURNED OLAY MOISTURIZER",
|
||||
"upc": "0007560402118",
|
||||
"quantity": 1,
|
||||
"unitPrice": 12.99,
|
||||
"totalPrice": 12.99,
|
||||
"status": "RETURNED",
|
||||
"department": "BEAUTY"
|
||||
},
|
||||
{
|
||||
"description": "FAVOURITE DAY TRAIL MIX",
|
||||
"tcin": "83921045",
|
||||
"dpci": "271-09-0142",
|
||||
"upc": "0085239700891",
|
||||
"quantity": 1,
|
||||
"unitPrice": 5.49,
|
||||
"totalPrice": 5.49,
|
||||
"regularPrice": 5.49,
|
||||
"circlePrice": null,
|
||||
"couponDiscount": null,
|
||||
"circleRewardsDiscount": null,
|
||||
"department": "SNACKS"
|
||||
},
|
||||
{
|
||||
"description": "BOGO GOOD & GATHER PASTA",
|
||||
"tcin": "78114326",
|
||||
"upc": "0085239300456",
|
||||
"quantity": 2,
|
||||
"unitPrice": 1.79,
|
||||
"totalPrice": 1.79,
|
||||
"regularPrice": 1.79,
|
||||
"circlePrice": 0.895,
|
||||
"couponDiscount": 0.0,
|
||||
"circleRewardsDiscount": 1.79,
|
||||
"promoDescription": "Buy 1 get 1 free",
|
||||
"department": "GROCERY"
|
||||
}
|
||||
],
|
||||
"subtotal": 78.32,
|
||||
"tax": 4.89,
|
||||
"total": 83.21,
|
||||
"totalSavings": 11.45
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,399 @@
|
||||
"""Tests for the Kroger receipt parser."""
|
||||
|
||||
from decimal import Decimal
|
||||
|
||||
from receiptwitness.parsers.kroger import _parse_item, _to_decimal, parse_kroger_receipt
|
||||
from receiptwitness.scrapers.base import RawReceipt
|
||||
|
||||
|
||||
class TestToDecimal:
    """Conversion behaviour of the Kroger parser's ``_to_decimal`` helper."""

    def test_from_int(self):
        """An int converts to the matching Decimal."""
        converted = _to_decimal(42)
        assert converted == Decimal("42")

    def test_from_float(self):
        """A float converts to its short decimal form."""
        converted = _to_decimal(3.99)
        assert converted == Decimal("3.99")

    def test_from_string(self):
        """A numeric string converts directly."""
        converted = _to_decimal("7.49")
        assert converted == Decimal("7.49")

    def test_none_returns_default(self):
        """None yields the built-in default of zero."""
        converted = _to_decimal(None)
        assert converted == Decimal("0")

    def test_none_custom_default(self):
        """None yields the caller-supplied default."""
        converted = _to_decimal(None, "1")
        assert converted == Decimal("1")

    def test_invalid_string_returns_default(self):
        """A non-numeric string yields the default."""
        converted = _to_decimal("not-a-number")
        assert converted == Decimal("0")

    def test_empty_string_returns_default(self):
        """An empty string yields the default."""
        converted = _to_decimal("")
        assert converted == Decimal("0")
|
||||
|
||||
|
||||
class TestParseItem:
    """Field mapping performed by the Kroger parser's ``_parse_item``."""

    def test_standard_item(self):
        """A fully populated line item maps every field."""
        line = {
            "description": "KROGER WHOLE MILK GAL",
            "upc": "0001111041700",
            "quantity": 1,
            "basePrice": 3.99,
            "totalPrice": 3.99,
            "regularPrice": 4.29,
            "salePrice": 3.99,
            "couponAmount": 0.0,
            "plusCardSavings": 0.30,
            "department": "DAIRY",
        }
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "KROGER WHOLE MILK GAL"
        assert parsed["upc"] == "1111041700"
        assert parsed["quantity"] == Decimal("1")
        assert parsed["unit_price"] == Decimal("3.99")
        assert parsed["extended_price"] == Decimal("3.99")
        assert parsed["regular_price"] == Decimal("4.29")
        assert parsed["sale_price"] == Decimal("3.99")
        assert parsed["loyalty_discount"] == Decimal("0.30")
        assert parsed["category_raw"] == "DAIRY"

    def test_weighted_item(self):
        """A by-weight item keeps its fractional quantity and has no UPC."""
        line = {
            "description": "KROGER DELI TURKEY BREAST",
            "quantity": 0.68,
            "basePrice": 9.99,
            "totalPrice": 6.79,
            "weight": 0.68,
            "weightUom": "LB",
            "department": "DELI",
        }
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "KROGER DELI TURKEY BREAST"
        assert parsed["upc"] is None
        assert parsed["quantity"] == Decimal("0.68")
        assert parsed["unit_price"] == Decimal("9.99")
        assert parsed["extended_price"] == Decimal("6.79")

    def test_missing_extended_price_computed(self):
        """Without a total price, extended price is unit price times quantity."""
        line = {
            "description": "TEST ITEM",
            "quantity": 3,
            "basePrice": 2.49,
        }
        parsed = _parse_item(line)
        expected = Decimal("2.49") * Decimal("3")
        assert parsed["extended_price"] == expected

    def test_item_with_coupon(self):
        """The coupon amount is surfaced as the coupon discount."""
        line = {
            "description": "TIDE PODS 42CT",
            "upc": "0003700096223",
            "quantity": 1,
            "basePrice": 13.99,
            "totalPrice": 13.99,
            "couponAmount": 2.00,
        }
        parsed = _parse_item(line)
        assert parsed["coupon_discount"] == Decimal("2.00")

    def test_missing_description_fallback(self):
        """An item with no description gets the placeholder name."""
        line = {"basePrice": 1.00, "totalPrice": 1.00}
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "UNKNOWN ITEM"

    def test_alternative_field_names_product_name(self):
        """Alternative key spellings are accepted for each field."""
        line = {
            "productName": "ALT NAME ITEM",
            "unitPrice": 5.00,
            "extendedAmount": 5.00,
            "qty": 1,
            "krogerProductId": "123456789",
            "category": "GROCERY",
        }
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "ALT NAME ITEM"
        assert parsed["unit_price"] == Decimal("5.00")
        assert parsed["extended_price"] == Decimal("5.00")
        assert parsed["upc"] == "123456789"
        assert parsed["category_raw"] == "GROCERY"

    def test_item_description_field_name(self):
        """``itemDescription``, ``price`` and ``lineTotal`` are recognised."""
        line = {
            "itemDescription": "ITEM DESC FIELD",
            "price": 3.00,
            "lineTotal": 3.00,
        }
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "ITEM DESC FIELD"
        assert parsed["unit_price"] == Decimal("3.00")
        assert parsed["extended_price"] == Decimal("3.00")

    def test_null_optional_fields(self):
        """Explicit nulls for optional prices and discounts stay None."""
        line = {
            "description": "BANANAS",
            "upc": "0000000004011",
            "quantity": 1,
            "basePrice": 0.59,
            "totalPrice": 0.59,
            "salePrice": None,
            "couponAmount": None,
            "plusCardSavings": None,
        }
        parsed = _parse_item(line)
        assert parsed["sale_price"] is None
        assert parsed["coupon_discount"] is None
        assert parsed["loyalty_discount"] is None

    def test_upc_leading_zeros_stripped(self):
        """Leading zeros are removed from the UPC."""
        line = {
            "description": "TEST",
            "upc": "0000000004011",
            "basePrice": 1.00,
            "totalPrice": 1.00,
        }
        parsed = _parse_item(line)
        assert parsed["upc"] == "4011"

    def test_upc_from_kroger_product_id(self):
        """``krogerProductId`` is used when no ``upc`` key is present."""
        line = {
            "description": "TEST",
            "krogerProductId": "987654321",
            "basePrice": 1.00,
            "totalPrice": 1.00,
        }
        parsed = _parse_item(line)
        assert parsed["upc"] == "987654321"

    def test_description_whitespace_stripped(self):
        """Surrounding whitespace is trimmed from the description."""
        line = {
            "description": " EXTRA SPACES ",
            "basePrice": 1.00,
            "totalPrice": 1.00,
        }
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "EXTRA SPACES"

    def test_promo_price_field(self):
        """``promoPrice`` and ``originalPrice`` map to sale and regular price."""
        line = {
            "description": "PROMO ITEM",
            "promoPrice": 2.99,
            "originalPrice": 4.99,
            "basePrice": 2.99,
            "totalPrice": 2.99,
        }
        parsed = _parse_item(line)
        assert parsed["sale_price"] == Decimal("2.99")
        assert parsed["regular_price"] == Decimal("4.99")

    def test_loyalty_discount_from_fuel_points(self):
        """``fuelPointsDiscount`` is reported as the loyalty discount."""
        line = {
            "description": "FUEL DISC ITEM",
            "fuelPointsDiscount": 0.50,
            "basePrice": 3.00,
            "totalPrice": 3.00,
        }
        parsed = _parse_item(line)
        assert parsed["loyalty_discount"] == Decimal("0.50")

    def test_multi_quantity_item(self):
        """Quantity, unit price and extended price are each preserved."""
        line = {
            "description": "PRIVATE SELECTION PASTA",
            "quantity": 3,
            "basePrice": 2.49,
            "totalPrice": 7.47,
            "department": "GROCERY",
        }
        parsed = _parse_item(line)
        assert parsed["quantity"] == Decimal("3")
        assert parsed["unit_price"] == Decimal("2.49")
        assert parsed["extended_price"] == Decimal("7.47")

    def test_aisle_as_category(self):
        """``aisle`` is used as the raw category."""
        line = {
            "description": "AISLE ITEM",
            "aisle": "FROZEN FOODS",
            "basePrice": 4.00,
            "totalPrice": 4.00,
        }
        parsed = _parse_item(line)
        assert parsed["category_raw"] == "FROZEN FOODS"
|
||||
|
||||
|
||||
class TestParseKrogerReceipt:
    """End-to-end behaviour of ``parse_kroger_receipt`` on whole receipts."""

    def test_full_receipt(self, kroger_receipt_data):
        """The sample fixture parses into the expected totals and items."""
        receipt = RawReceipt(
            receipt_id="KR-2026-0312-4471",
            purchase_date="2026-03-12T16:45:00Z",
            store_number="00357",
            raw_data=kroger_receipt_data,
        )
        parsed = parse_kroger_receipt(receipt)

        assert parsed["receipt_id"] == "KR-2026-0312-4471"
        assert parsed["purchase_date"] == "2026-03-12T16:45:00Z"
        assert parsed["total"] == Decimal("94.17")
        assert parsed["subtotal"] == Decimal("78.47")
        assert parsed["tax"] == Decimal("5.50")
        assert parsed["savings_total"] == Decimal("15.30")

        # Should have 8 items (voided + returned items excluded)
        assert len(parsed["items"]) == 8

        # Verify first item
        first_item = parsed["items"][0]
        assert first_item["product_name_raw"] == "KROGER WHOLE MILK GAL"
        assert first_item["upc"] == "1111041700"

    def test_voided_items_excluded(self, kroger_receipt_data):
        """Items flagged as voided do not appear in the parsed output."""
        receipt = RawReceipt(
            receipt_id="KR-2026-0312-4471",
            purchase_date="2026-03-12",
            raw_data=kroger_receipt_data,
        )
        parsed = parse_kroger_receipt(receipt)

        names = [entry["product_name_raw"] for entry in parsed["items"]]
        assert "VOIDED DORITOS NACHO" not in names

    def test_returned_items_excluded(self, kroger_receipt_data):
        """Items with a RETURNED status do not appear in the parsed output."""
        receipt = RawReceipt(
            receipt_id="KR-2026-0312-4471",
            purchase_date="2026-03-12",
            raw_data=kroger_receipt_data,
        )
        parsed = parse_kroger_receipt(receipt)

        names = [entry["product_name_raw"] for entry in parsed["items"]]
        assert "RETURNED GATORADE 8PK" not in names

    def test_return_flag_items_excluded(self):
        """``returnFlag`` and ``isReturn`` both exclude an item."""
        payload = {
            "detail": {
                "items": [
                    {
                        "description": "NORMAL ITEM",
                        "basePrice": 5.00,
                        "totalPrice": 5.00,
                    },
                    {
                        "description": "RETURNED VIA FLAG",
                        "basePrice": 3.00,
                        "totalPrice": 3.00,
                        "returnFlag": True,
                    },
                    {
                        "description": "IS RETURN ITEM",
                        "basePrice": 2.00,
                        "totalPrice": 2.00,
                        "isReturn": True,
                    },
                ],
                "total": 5.00,
            }
        }
        receipt = RawReceipt(
            receipt_id="RET-001",
            purchase_date="2026-03-12",
            raw_data=payload,
        )
        parsed = parse_kroger_receipt(receipt)
        assert len(parsed["items"]) == 1
        assert parsed["items"][0]["product_name_raw"] == "NORMAL ITEM"

    def test_empty_receipt(self):
        """A receipt with no items parses to an empty list and zero total."""
        receipt = RawReceipt(
            receipt_id="EMPTY-001",
            purchase_date="2026-03-12",
            raw_data={"detail": {"items": [], "total": 0}},
        )
        parsed = parse_kroger_receipt(receipt)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("0")

    def test_receipt_with_no_detail(self):
        """Without a detail section the top-level total is still used."""
        receipt = RawReceipt(
            receipt_id="NO-DETAIL-001",
            purchase_date="2026-03-12",
            raw_data={"total": 50.00},
        )
        parsed = parse_kroger_receipt(receipt)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("50.00")

    def test_raw_data_preserved(self, kroger_receipt_data):
        """The original raw payload object is passed through unchanged."""
        receipt = RawReceipt(
            receipt_id="KR-2026-0312-4471",
            purchase_date="2026-03-12",
            raw_data=kroger_receipt_data,
        )
        parsed = parse_kroger_receipt(receipt)
        assert parsed["raw_data"] is kroger_receipt_data

    def test_alternative_total_field_names(self):
        """Alternative key spellings for the totals are recognised."""
        receipt = RawReceipt(
            receipt_id="ALT-001",
            purchase_date="2026-03-12",
            raw_data={
                "orderTotal": 42.00,
                "subTotal": 35.00,
                "salesTax": 3.50,
                "youSaved": 5.00,
                "detail": {"items": []},
            },
        )
        parsed = parse_kroger_receipt(receipt)
        assert parsed["total"] == Decimal("42.00")
        assert parsed["subtotal"] == Decimal("35.00")
        assert parsed["tax"] == Decimal("3.50")
        assert parsed["savings_total"] == Decimal("5.00")

    def test_receipt_items_alternative_key(self):
        """Items listed under ``receiptItems`` are parsed too."""
        payload = {
            "detail": {
                "receiptItems": [
                    {
                        "description": "ALT KEY ITEM",
                        "basePrice": 3.00,
                        "totalPrice": 3.00,
                    }
                ],
                "total": 3.00,
            }
        }
        receipt = RawReceipt(
            receipt_id="ALT-KEY-001",
            purchase_date="2026-03-12",
            raw_data=payload,
        )
        parsed = parse_kroger_receipt(receipt)
        assert len(parsed["items"]) == 1
        assert parsed["items"][0]["product_name_raw"] == "ALT KEY ITEM"

    def test_source_url_preserved(self):
        """The receipt's source URL is carried into the parsed result."""
        receipt = RawReceipt(
            receipt_id="URL-001",
            purchase_date="2026-03-12",
            raw_data={"detail": {"items": [], "total": 0}},
            source_url="https://www.kroger.com/atlas/v1/receipt/api?orderId=URL-001",
        )
        parsed = parse_kroger_receipt(receipt)
        assert parsed["source_url"] == "https://www.kroger.com/atlas/v1/receipt/api?orderId=URL-001"

    def test_weighted_items_in_full_receipt(self, kroger_receipt_data):
        """The by-weight turkey line keeps its fractional quantity."""
        receipt = RawReceipt(
            receipt_id="KR-2026-0312-4471",
            purchase_date="2026-03-12",
            raw_data=kroger_receipt_data,
        )
        parsed = parse_kroger_receipt(receipt)

        # Find the weighted turkey item
        turkey = next(
            entry for entry in parsed["items"] if "TURKEY" in entry["product_name_raw"]
        )
        assert turkey["quantity"] == Decimal("0.68")
        assert turkey["unit_price"] == Decimal("9.99")
        assert turkey["extended_price"] == Decimal("6.79")

    def test_grand_total_field(self):
        """``grandTotal`` is accepted as the receipt total."""
        receipt = RawReceipt(
            receipt_id="GT-001",
            purchase_date="2026-03-12",
            raw_data={"grandTotal": 99.99, "detail": {"items": []}},
        )
        parsed = parse_kroger_receipt(receipt)
        assert parsed["total"] == Decimal("99.99")
|
||||
@@ -0,0 +1,174 @@
|
||||
"""Tests for the Meijer receipt parser."""
|
||||
|
||||
from decimal import Decimal
|
||||
|
||||
from receiptwitness.parsers.meijer import _parse_item, _to_decimal, parse_meijer_receipt
|
||||
from receiptwitness.scrapers.base import RawReceipt
|
||||
|
||||
|
||||
class TestToDecimal:
    """Checks for the Meijer parser's ``_to_decimal`` conversion helper."""

    def test_from_int(self):
        """An int converts to the equal Decimal."""
        converted = _to_decimal(42)
        assert converted == Decimal("42")

    def test_from_float(self):
        """A float converts to the Decimal of its short decimal form."""
        converted = _to_decimal(3.49)
        assert converted == Decimal("3.49")

    def test_from_string(self):
        """A numeric string converts to the equal Decimal."""
        converted = _to_decimal("7.99")
        assert converted == Decimal("7.99")

    def test_none_returns_default(self):
        """``None`` without an explicit default yields zero."""
        converted = _to_decimal(None)
        assert converted == Decimal("0")

    def test_none_custom_default(self):
        """``None`` with a default of ``"1"`` yields one."""
        converted = _to_decimal(None, "1")
        assert converted == Decimal("1")

    def test_invalid_string_returns_default(self):
        """A non-numeric string yields zero."""
        converted = _to_decimal("not-a-number")
        assert converted == Decimal("0")
|
||||
|
||||
|
||||
class TestParseItem:
    """Exercise the Meijer parser's ``_parse_item`` on single line-item dicts."""

    def test_standard_item(self):
        """A fully populated item is mapped onto the normalized field names."""
        payload = {
            "description": "ORGANIC BANANAS",
            "upc": "0000000004011",
            "quantity": 1,
            "price": 0.69,
            "extendedPrice": 0.69,
            "regularPrice": 0.79,
            "salePrice": 0.69,
            "couponDiscount": 0.0,
            "mperksDiscount": 0.10,
            "category": "PRODUCE",
        }
        expected = {
            "product_name_raw": "ORGANIC BANANAS",
            "upc": "4011",
            "quantity": Decimal("1"),
            "unit_price": Decimal("0.69"),
            "extended_price": Decimal("0.69"),
            "regular_price": Decimal("0.79"),
            "sale_price": Decimal("0.69"),
            "loyalty_discount": Decimal("0.10"),
            "category_raw": "PRODUCE",
        }

        item = _parse_item(payload)

        for field, value in expected.items():
            assert item[field] == value, field

    def test_weighted_item(self):
        """A fractional quantity is kept and a missing UPC comes back as None."""
        payload = {
            "description": "WEIGHTED DELI TURKEY",
            "quantity": 0.75,
            "price": 8.99,
            "extendedPrice": 6.74,
            "category": "DELI",
        }

        item = _parse_item(payload)

        assert item["product_name_raw"] == "WEIGHTED DELI TURKEY"
        assert item["upc"] is None
        assert item["quantity"] == Decimal("0.75")
        assert item["unit_price"] == Decimal("8.99")
        assert item["extended_price"] == Decimal("6.74")

    def test_missing_extended_price_computed(self):
        """Without ``extendedPrice`` the result equals price times quantity."""
        payload = {
            "description": "TEST ITEM",
            "quantity": 3,
            "price": 2.50,
        }

        item = _parse_item(payload)

        expected_total = Decimal("2.50") * Decimal("3")
        assert item["extended_price"] == expected_total

    def test_item_with_coupon_discount(self):
        """``couponDiscount`` is surfaced as ``coupon_discount``."""
        payload = {
            "description": "CHEERIOS 18OZ",
            "upc": "0016000275614",
            "quantity": 1,
            "price": 4.99,
            "extendedPrice": 4.99,
            "couponDiscount": 0.50,
        }

        item = _parse_item(payload)

        assert item["coupon_discount"] == Decimal("0.50")

    def test_missing_description_fallback(self):
        """An item with no description gets the ``UNKNOWN ITEM`` placeholder."""
        item = _parse_item({"price": 1.00, "extendedPrice": 1.00})
        assert item["product_name_raw"] == "UNKNOWN ITEM"

    def test_alternative_field_names(self):
        """The alternative key spellings used below are accepted."""
        payload = {
            "itemDescription": "ALT NAME ITEM",
            "unitPrice": 5.00,
            "totalPrice": 5.00,
            "qty": 1,
            "UPC": "123456789",
            "departmentDescription": "GROCERY",
        }

        item = _parse_item(payload)

        assert item["product_name_raw"] == "ALT NAME ITEM"
        assert item["unit_price"] == Decimal("5.00")
        assert item["upc"] == "123456789"
        assert item["category_raw"] == "GROCERY"
|
||||
|
||||
|
||||
class TestParseMeijerReceipt:
    """Receipt-level checks for ``parse_meijer_receipt``."""

    def test_full_receipt(self, meijer_receipt_data):
        """The fixture receipt yields the expected header values and items."""
        receipt = RawReceipt(
            receipt_id="TXN-2026-0310-001",
            purchase_date="2026-03-10T14:30:00Z",
            store_number="42",
            raw_data=meijer_receipt_data,
        )

        parsed = parse_meijer_receipt(receipt)

        assert parsed["receipt_id"] == "TXN-2026-0310-001"
        assert parsed["purchase_date"] == "2026-03-10T14:30:00Z"
        assert parsed["total"] == Decimal("87.42")
        assert parsed["subtotal"] == Decimal("74.92")
        assert parsed["tax"] == Decimal("5.24")
        assert parsed["savings_total"] == Decimal("12.50")

        # Should have 5 items (voided item excluded)
        assert len(parsed["items"]) == 5

        # Verify first item
        first_item = parsed["items"][0]
        assert first_item["product_name_raw"] == "ORGANIC BANANAS"
        assert first_item["upc"] == "4011"

    def test_voided_items_excluded(self, meijer_receipt_data):
        """The voided soda line from the fixture is absent from the items."""
        receipt = RawReceipt(
            receipt_id="TXN-2026-0310-001",
            purchase_date="2026-03-10",
            raw_data=meijer_receipt_data,
        )

        parsed = parse_meijer_receipt(receipt)

        names = [entry["product_name_raw"] for entry in parsed["items"]]
        assert "VOIDED SODA 12PK" not in names

    def test_empty_receipt(self):
        """An empty item list parses to no items and a zero total."""
        receipt = RawReceipt(
            receipt_id="EMPTY-001",
            purchase_date="2026-03-10",
            raw_data={"detail": {"items": [], "total": 0}},
        )

        parsed = parse_meijer_receipt(receipt)

        assert parsed["items"] == []
        assert parsed["total"] == Decimal("0")

    def test_receipt_with_no_detail(self):
        """With no ``detail`` section the top-level total is still reported."""
        receipt = RawReceipt(
            receipt_id="NO-DETAIL-001",
            purchase_date="2026-03-10",
            raw_data={"total": 50.00},
        )

        parsed = parse_meijer_receipt(receipt)

        assert parsed["items"] == []
        assert parsed["total"] == Decimal("50.00")

    def test_raw_data_preserved(self, meijer_receipt_data):
        """The parsed output carries the very same raw payload object."""
        receipt = RawReceipt(
            receipt_id="TXN-2026-0310-001",
            purchase_date="2026-03-10",
            raw_data=meijer_receipt_data,
        )

        parsed = parse_meijer_receipt(receipt)

        # Identity, not equality: the payload must not be copied.
        assert parsed["raw_data"] is meijer_receipt_data
|
||||
@@ -0,0 +1,471 @@
|
||||
"""Tests for the Target receipt parser."""
|
||||
|
||||
from decimal import Decimal
|
||||
|
||||
from receiptwitness.parsers.target import _parse_item, _to_decimal, parse_target_receipt
|
||||
from receiptwitness.scrapers.base import RawReceipt
|
||||
|
||||
|
||||
class TestToDecimal:
    """Checks for the Target parser's ``_to_decimal`` conversion helper."""

    def test_from_int(self):
        """An int converts to the equal Decimal."""
        converted = _to_decimal(42)
        assert converted == Decimal("42")

    def test_from_float(self):
        """A float converts to the Decimal of its short decimal form."""
        converted = _to_decimal(3.89)
        assert converted == Decimal("3.89")

    def test_from_string(self):
        """A numeric string converts to the equal Decimal."""
        converted = _to_decimal("8.99")
        assert converted == Decimal("8.99")

    def test_none_returns_default(self):
        """``None`` without an explicit default yields zero."""
        converted = _to_decimal(None)
        assert converted == Decimal("0")

    def test_none_custom_default(self):
        """``None`` with a default of ``"1"`` yields one."""
        converted = _to_decimal(None, "1")
        assert converted == Decimal("1")

    def test_invalid_string_returns_default(self):
        """A non-numeric string yields zero."""
        converted = _to_decimal("not-a-number")
        assert converted == Decimal("0")

    def test_empty_string_returns_default(self):
        """An empty string yields zero."""
        converted = _to_decimal("")
        assert converted == Decimal("0")
|
||||
|
||||
|
||||
class TestParseItem:
    """Exercise the Target parser's ``_parse_item`` on single line-item dicts."""

    def test_standard_item(self):
        """A fully populated item is mapped onto the normalized field names."""
        payload = {
            "description": "GOOD & GATHER WHOLE MILK GAL",
            "tcin": "14767459",
            "upc": "0085239100123",
            "quantity": 1,
            "unitPrice": 3.89,
            "totalPrice": 3.89,
            "regularPrice": 4.19,
            "circlePrice": 3.89,
            "couponDiscount": 0.0,
            "circleRewardsDiscount": 0.30,
            "department": "GROCERY",
        }
        expected = {
            "product_name_raw": "GOOD & GATHER WHOLE MILK GAL",
            "upc": "85239100123",
            "quantity": Decimal("1"),
            "unit_price": Decimal("3.89"),
            "extended_price": Decimal("3.89"),
            "regular_price": Decimal("4.19"),
            "sale_price": Decimal("3.89"),
            "loyalty_discount": Decimal("0.30"),
            "category_raw": "GROCERY",
        }

        item = _parse_item(payload)

        for field, value in expected.items():
            assert item[field] == value, field

    def test_weighted_item(self):
        """A fractional quantity is kept and a missing UPC comes back as None."""
        payload = {
            "description": "DELI SLICED TURKEY BREAST",
            "quantity": 0.72,
            "unitPrice": 10.99,
            "totalPrice": 7.91,
            "weight": 0.72,
            "weightUom": "LB",
            "department": "DELI",
        }

        item = _parse_item(payload)

        assert item["product_name_raw"] == "DELI SLICED TURKEY BREAST"
        assert item["upc"] is None
        assert item["quantity"] == Decimal("0.72")
        assert item["unit_price"] == Decimal("10.99")
        assert item["extended_price"] == Decimal("7.91")

    def test_missing_extended_price_computed(self):
        """Without a total the result equals unit price times quantity."""
        payload = {
            "description": "TEST ITEM",
            "quantity": 3,
            "unitPrice": 2.49,
        }

        item = _parse_item(payload)

        expected_total = Decimal("2.49") * Decimal("3")
        assert item["extended_price"] == expected_total

    def test_item_with_coupon(self):
        """``couponDiscount`` is surfaced as ``coupon_discount``."""
        payload = {
            "description": "TIDE PODS 42CT",
            "upc": "0003700096223",
            "quantity": 1,
            "unitPrice": 13.49,
            "totalPrice": 13.49,
            "couponDiscount": 2.50,
        }

        item = _parse_item(payload)

        assert item["coupon_discount"] == Decimal("2.50")

    def test_missing_description_fallback(self):
        """An item with no description gets the ``UNKNOWN ITEM`` placeholder."""
        item = _parse_item({"unitPrice": 1.00, "totalPrice": 1.00})
        assert item["product_name_raw"] == "UNKNOWN ITEM"

    def test_alternative_field_names(self):
        """The alternative key spellings used below are accepted."""
        payload = {
            "productName": "ALT NAME ITEM",
            "price": 5.00,
            "extendedPrice": 5.00,
            "qty": 1,
            "UPC": "123456789",
            "category": "FROZEN",
        }

        item = _parse_item(payload)

        assert item["product_name_raw"] == "ALT NAME ITEM"
        assert item["unit_price"] == Decimal("5.00")
        assert item["extended_price"] == Decimal("5.00")
        assert item["upc"] == "123456789"
        assert item["category_raw"] == "FROZEN"

    def test_item_description_field_name(self):
        """``itemDescription`` and ``lineTotal`` are accepted as key names."""
        payload = {
            "itemDescription": "ITEM DESC FIELD",
            "price": 3.00,
            "lineTotal": 3.00,
        }

        item = _parse_item(payload)

        assert item["product_name_raw"] == "ITEM DESC FIELD"
        assert item["unit_price"] == Decimal("3.00")
        assert item["extended_price"] == Decimal("3.00")

    def test_null_optional_fields(self):
        """Explicit ``None`` values for optional prices stay ``None``."""
        payload = {
            "description": "BANANAS",
            "upc": "0000000004011",
            "quantity": 1,
            "unitPrice": 0.25,
            "totalPrice": 0.25,
            "circlePrice": None,
            "couponDiscount": None,
            "circleRewardsDiscount": None,
        }

        item = _parse_item(payload)

        for field in ("sale_price", "coupon_discount", "loyalty_discount"):
            assert item[field] is None, field

    def test_upc_leading_zeros_stripped(self):
        """Leading zeros are removed from the UPC."""
        payload = {
            "description": "TEST",
            "upc": "0000000004011",
            "unitPrice": 1.00,
            "totalPrice": 1.00,
        }

        item = _parse_item(payload)

        assert item["upc"] == "4011"

    def test_description_whitespace_stripped(self):
        """Surrounding whitespace is removed from the description."""
        payload = {
            "description": " EXTRA SPACES ",
            "unitPrice": 1.00,
            "totalPrice": 1.00,
        }

        item = _parse_item(payload)

        assert item["product_name_raw"] == "EXTRA SPACES"

    def test_circle_price_preferred_over_sale_price(self):
        """When both are present, ``circlePrice`` becomes the sale price."""
        payload = {
            "description": "CIRCLE ITEM",
            "circlePrice": 2.99,
            "salePrice": 3.49,
            "unitPrice": 2.99,
            "totalPrice": 2.99,
        }

        item = _parse_item(payload)

        assert item["sale_price"] == Decimal("2.99")

    def test_sale_price_fallback_when_no_circle_price(self):
        """Without ``circlePrice``, ``salePrice`` becomes the sale price."""
        payload = {
            "description": "SALE ITEM",
            "salePrice": 3.49,
            "unitPrice": 3.49,
            "totalPrice": 3.49,
        }

        item = _parse_item(payload)

        assert item["sale_price"] == Decimal("3.49")

    def test_circle_rewards_discount(self):
        """``circleRewardsDiscount`` is surfaced as ``loyalty_discount``."""
        payload = {
            "description": "CIRCLE REWARDS ITEM",
            "circleRewardsDiscount": 1.50,
            "unitPrice": 5.00,
            "totalPrice": 5.00,
        }

        item = _parse_item(payload)

        assert item["loyalty_discount"] == Decimal("1.50")

    def test_circle_discount_fallback(self):
        """``circleDiscount`` is also accepted for ``loyalty_discount``."""
        payload = {
            "description": "CIRCLE DISC ITEM",
            "circleDiscount": 0.75,
            "unitPrice": 3.00,
            "totalPrice": 3.00,
        }

        item = _parse_item(payload)

        assert item["loyalty_discount"] == Decimal("0.75")

    def test_bogo_item(self):
        """A buy-one-get-one line keeps the supplied totals and discounts."""
        payload = {
            "description": "BOGO GOOD & GATHER PASTA",
            "upc": "0085239300456",
            "quantity": 2,
            "unitPrice": 1.79,
            "totalPrice": 1.79,
            "regularPrice": 1.79,
            "circlePrice": 0.895,
            "circleRewardsDiscount": 1.79,
            "promoDescription": "Buy 1 get 1 free",
            "department": "GROCERY",
        }
        expected = {
            "quantity": Decimal("2"),
            "unit_price": Decimal("1.79"),
            "extended_price": Decimal("1.79"),
            "sale_price": Decimal("0.895"),
            "loyalty_discount": Decimal("1.79"),
        }

        item = _parse_item(payload)

        for field, value in expected.items():
            assert item[field] == value, field

    def test_multi_quantity_item(self):
        """Quantity, unit price and supplied total are all carried through."""
        payload = {
            "description": "MARKET PANTRY EGGS",
            "quantity": 2,
            "unitPrice": 4.99,
            "totalPrice": 9.98,
            "department": "GROCERY",
        }

        item = _parse_item(payload)

        assert item["quantity"] == Decimal("2")
        assert item["unit_price"] == Decimal("4.99")
        assert item["extended_price"] == Decimal("9.98")

    def test_coupon_savings_field(self):
        """``couponSavings`` is also accepted for ``coupon_discount``."""
        payload = {
            "description": "COUPON ITEM",
            "couponSavings": 1.00,
            "unitPrice": 5.00,
            "totalPrice": 5.00,
        }

        item = _parse_item(payload)

        assert item["coupon_discount"] == Decimal("1.00")
|
||||
|
||||
|
||||
class TestParseTargetReceipt:
    """Receipt-level checks for ``parse_target_receipt``."""

    def test_full_receipt(self, target_receipt_data):
        """The fixture receipt yields the expected header values and items."""
        raw = RawReceipt(
            receipt_id="TGT-2026-0315-7890",
            purchase_date="2026-03-15T11:23:00Z",
            store_number="2774",
            raw_data=target_receipt_data,
        )
        result = parse_target_receipt(raw)

        assert result["receipt_id"] == "TGT-2026-0315-7890"
        assert result["purchase_date"] == "2026-03-15T11:23:00Z"
        assert result["total"] == Decimal("83.21")
        assert result["subtotal"] == Decimal("78.32")
        assert result["tax"] == Decimal("4.89")
        assert result["savings_total"] == Decimal("11.45")

        # Should have 8 items (voided + returned items excluded)
        assert len(result["items"]) == 8

        # Verify first item
        milk = result["items"][0]
        assert milk["product_name_raw"] == "GOOD & GATHER WHOLE MILK GAL"
        assert milk["upc"] == "85239100123"

    def test_voided_items_excluded(self, target_receipt_data):
        """The voided Coca-Cola line from the fixture is absent from the items."""
        raw = RawReceipt(
            receipt_id="TGT-2026-0315-7890",
            purchase_date="2026-03-15",
            raw_data=target_receipt_data,
        )
        result = parse_target_receipt(raw)

        item_names = [i["product_name_raw"] for i in result["items"]]
        assert "VOIDED COCA-COLA 12PK" not in item_names

    def test_returned_items_excluded(self, target_receipt_data):
        """The returned moisturizer line from the fixture is absent from the items."""
        raw = RawReceipt(
            receipt_id="TGT-2026-0315-7890",
            purchase_date="2026-03-15",
            raw_data=target_receipt_data,
        )
        result = parse_target_receipt(raw)

        item_names = [i["product_name_raw"] for i in result["items"]]
        assert "RETURNED OLAY MOISTURIZER" not in item_names

    def test_return_flag_items_excluded(self):
        """Items flagged with ``returnFlag`` or ``isReturn`` are dropped."""
        data = {
            "detail": {
                "items": [
                    {
                        "description": "NORMAL ITEM",
                        "unitPrice": 5.00,
                        "totalPrice": 5.00,
                    },
                    {
                        "description": "RETURNED VIA FLAG",
                        "unitPrice": 3.00,
                        "totalPrice": 3.00,
                        "returnFlag": True,
                    },
                    {
                        "description": "IS RETURN ITEM",
                        "unitPrice": 2.00,
                        "totalPrice": 2.00,
                        "isReturn": True,
                    },
                ],
                "total": 5.00,
            }
        }
        raw = RawReceipt(
            receipt_id="RET-001",
            purchase_date="2026-03-15",
            raw_data=data,
        )
        result = parse_target_receipt(raw)
        assert len(result["items"]) == 1
        assert result["items"][0]["product_name_raw"] == "NORMAL ITEM"

    def test_cancelled_items_excluded(self):
        """Items whose ``status`` is ``CANCELLED`` are dropped."""
        data = {
            "detail": {
                "items": [
                    {
                        "description": "NORMAL ITEM",
                        "unitPrice": 5.00,
                        "totalPrice": 5.00,
                    },
                    {
                        "description": "CANCELLED ITEM",
                        "unitPrice": 3.00,
                        "totalPrice": 3.00,
                        "status": "CANCELLED",
                    },
                ],
                "total": 5.00,
            }
        }
        raw = RawReceipt(
            receipt_id="CAN-001",
            purchase_date="2026-03-15",
            raw_data=data,
        )
        result = parse_target_receipt(raw)
        assert len(result["items"]) == 1
        assert result["items"][0]["product_name_raw"] == "NORMAL ITEM"

    def test_empty_receipt(self):
        """An empty item list parses to no items and a zero total."""
        raw = RawReceipt(
            receipt_id="EMPTY-001",
            purchase_date="2026-03-15",
            raw_data={"detail": {"items": [], "total": 0}},
        )
        result = parse_target_receipt(raw)
        assert result["items"] == []
        assert result["total"] == Decimal("0")

    def test_receipt_with_no_detail(self):
        """With no ``detail`` section the top-level total is still reported."""
        raw = RawReceipt(
            receipt_id="NO-DETAIL-001",
            purchase_date="2026-03-15",
            raw_data={"total": 50.00},
        )
        result = parse_target_receipt(raw)
        assert result["items"] == []
        assert result["total"] == Decimal("50.00")

    def test_raw_data_preserved(self, target_receipt_data):
        """The parsed output carries the very same raw payload object."""
        raw = RawReceipt(
            receipt_id="TGT-2026-0315-7890",
            purchase_date="2026-03-15",
            raw_data=target_receipt_data,
        )
        result = parse_target_receipt(raw)
        # Identity, not equality: the payload must not be copied.
        assert result["raw_data"] is target_receipt_data

    def test_alternative_total_field_names(self):
        """The alternative total, subtotal, tax and savings keys are accepted."""
        raw = RawReceipt(
            receipt_id="ALT-001",
            purchase_date="2026-03-15",
            raw_data={
                "orderTotal": 42.00,
                "subTotal": 35.00,
                "salesTax": 3.50,
                "circleSavings": 5.00,
                "detail": {"items": []},
            },
        )
        result = parse_target_receipt(raw)
        assert result["total"] == Decimal("42.00")
        assert result["subtotal"] == Decimal("35.00")
        assert result["tax"] == Decimal("3.50")
        assert result["savings_total"] == Decimal("5.00")

    def test_receipt_items_alternative_key(self):
        """Line items stored under ``detail.lineItems`` are still parsed."""
        data = {
            "detail": {
                "lineItems": [
                    {
                        "description": "ALT KEY ITEM",
                        "unitPrice": 3.00,
                        "totalPrice": 3.00,
                    }
                ],
                "total": 3.00,
            }
        }
        raw = RawReceipt(
            receipt_id="ALT-KEY-001",
            purchase_date="2026-03-15",
            raw_data=data,
        )
        result = parse_target_receipt(raw)
        assert len(result["items"]) == 1
        assert result["items"][0]["product_name_raw"] == "ALT KEY ITEM"

    def test_source_url_preserved(self):
        """The ``source_url`` set on the raw receipt appears unchanged in the output."""
        raw = RawReceipt(
            receipt_id="URL-001",
            purchase_date="2026-03-15",
            raw_data={"detail": {"items": [], "total": 0}},
            source_url="https://api.target.com/order_history/v1/orders/URL-001",
        )
        result = parse_target_receipt(raw)
        assert result["source_url"] == "https://api.target.com/order_history/v1/orders/URL-001"

    def test_weighted_items_in_full_receipt(self, target_receipt_data):
        """The weighted turkey line in the fixture keeps its fractional quantity."""
        raw = RawReceipt(
            receipt_id="TGT-2026-0315-7890",
            purchase_date="2026-03-15",
            raw_data=target_receipt_data,
        )
        result = parse_target_receipt(raw)

        # Find the weighted turkey item. A default of None turns a missing item
        # into a readable assertion failure instead of a bare StopIteration.
        turkey = next(
            (i for i in result["items"] if "TURKEY" in i["product_name_raw"]),
            None,
        )
        assert turkey is not None, "no TURKEY item found in parsed receipt"
        assert turkey["quantity"] == Decimal("0.72")
        assert turkey["unit_price"] == Decimal("10.99")
        assert turkey["extended_price"] == Decimal("7.91")

    def test_bogo_items_in_full_receipt(self, target_receipt_data):
        """The BOGO pasta line in the fixture keeps its total and discount."""
        raw = RawReceipt(
            receipt_id="TGT-2026-0315-7890",
            purchase_date="2026-03-15",
            raw_data=target_receipt_data,
        )
        result = parse_target_receipt(raw)

        # Find the BOGO pasta item; same None-default pattern as the turkey lookup.
        pasta = next(
            (i for i in result["items"] if "BOGO" in i["product_name_raw"]),
            None,
        )
        assert pasta is not None, "no BOGO item found in parsed receipt"
        assert pasta["quantity"] == Decimal("2")
        assert pasta["extended_price"] == Decimal("1.79")
        assert pasta["loyalty_discount"] == Decimal("1.79")

    def test_grand_total_field(self):
        """A top-level ``grandTotal`` value is reported as the receipt total."""
        raw = RawReceipt(
            receipt_id="GT-001",
            purchase_date="2026-03-15",
            raw_data={"grandTotal": 99.99, "detail": {"items": []}},
        )
        result = parse_target_receipt(raw)
        assert result["total"] == Decimal("99.99")
|
||||
@@ -0,0 +1,23 @@
|
||||
"""Shared test fixtures for pipeline tests."""
|
||||
|
||||
import pytest
|
||||
from cartsnitch_common.models.base import Base
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
|
||||
@pytest.fixture
def engine():
    """Provide an in-memory SQLite engine with every ``Base`` table created.

    The engine is disposed once the fixture is torn down.
    """
    memory_engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(memory_engine)
    yield memory_engine
    memory_engine.dispose()
|
||||
|
||||
|
||||
@pytest.fixture
def session(engine):
    """Provide a SQLAlchemy session bound to the ``engine`` fixture.

    The session is opened as a context manager, so it is closed when the
    fixture is torn down.
    """
    session_factory = sessionmaker(bind=engine)
    with session_factory() as db_session:
        yield db_session
|
||||
@@ -0,0 +1,161 @@
|
||||
"""Tests for product matching & dedup pipeline."""
|
||||
|
||||
import uuid
|
||||
from datetime import UTC, datetime
|
||||
from decimal import Decimal
|
||||
|
||||
from cartsnitch_common.constants import MatchConfidence
|
||||
from cartsnitch_common.models.product import NormalizedProduct
|
||||
from cartsnitch_common.schemas.purchase import PurchaseItemCreate
|
||||
|
||||
from receiptwitness.pipeline.matching import (
|
||||
ProductMatcher,
|
||||
classify_confidence,
|
||||
match_purchase_item,
|
||||
)
|
||||
from receiptwitness.pipeline.normalization import MatchMethod
|
||||
|
||||
|
||||
class TestClassifyConfidence:
    """Map (score, match method) pairs to the expected ``MatchConfidence``."""

    def test_upc_always_high(self):
        """A UPC match is HIGH for both scores checked here."""
        for score in (1.0, 0.5):
            assert classify_confidence(score, MatchMethod.UPC) == MatchConfidence.HIGH

    def test_name_high(self):
        """Name scores of 0.9 and 0.8 classify as HIGH."""
        for score in (0.9, 0.8):
            assert classify_confidence(score, MatchMethod.NAME) == MatchConfidence.HIGH

    def test_name_medium(self):
        """Name scores of 0.6 and 0.5 classify as MEDIUM."""
        for score in (0.6, 0.5):
            assert classify_confidence(score, MatchMethod.NAME) == MatchConfidence.MEDIUM

    def test_name_low(self):
        """Name scores of 0.3 and 0.0 classify as LOW."""
        for score in (0.3, 0.0):
            assert classify_confidence(score, MatchMethod.NAME) == MatchConfidence.LOW
|
||||
|
||||
|
||||
class TestProductMatcher:
    """Behaviour of ``ProductMatcher`` against the in-memory test database."""

    def _make_item(self, name: str, upc: str | None = None) -> PurchaseItemCreate:
        """Build a purchase item with fixed prices and the given name and UPC."""
        price = Decimal("3.99")
        return PurchaseItemCreate(
            product_name_raw=name,
            upc=upc,
            unit_price=price,
            extended_price=Decimal("3.99"),
        )

    def test_match_by_upc(self, session):
        """An item whose UPC is a stored variant matches that product as HIGH."""
        stored = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Whole Milk Gallon",
            upc_variants=["041250000001"],
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(stored)
        session.commit()

        matcher = ProductMatcher(session)
        purchase = self._make_item("Kroger Milk", upc="041250000001")
        matched, match_info, level = matcher.match_single(purchase)

        assert matched is not None
        assert matched.id == stored.id
        assert match_info is not None
        assert match_info.method == MatchMethod.UPC
        assert level == MatchConfidence.HIGH

    def test_match_by_name(self, session):
        """With no UPC, a similar name matches via the NAME method."""
        stored = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Whole Milk Gallon",
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(stored)
        session.commit()

        matcher = ProductMatcher(session, name_threshold=0.3)
        purchase = self._make_item("Whole Milk Gallon Size")
        matched, match_info, _level = matcher.match_single(purchase)

        assert matched is not None
        assert match_info is not None
        assert match_info.method == MatchMethod.NAME

    def test_auto_create_when_no_match(self, session):
        """With ``auto_create`` on, an unmatched item produces a new LOW product."""
        matcher = ProductMatcher(session, auto_create=True)
        purchase = self._make_item("Unique Product XYZ 16 oz")
        created, match_info, level = matcher.match_single(purchase)

        assert created is not None
        assert match_info is None  # No match found, was created
        assert level == MatchConfidence.LOW
        assert created.canonical_name == "Unique Product XYZ 16 oz"
        assert created.size == "16"
        assert created.size_unit == "oz"

    def test_no_create_when_disabled(self, session):
        """With ``auto_create`` off, an unmatched item yields no product."""
        matcher = ProductMatcher(session, auto_create=False)
        purchase = self._make_item("Nonexistent Product")
        matched, match_info, _level = matcher.match_single(purchase)

        assert matched is None
        assert match_info is None

    def test_batch_match(self, session):
        """``match_items`` returns one outcome per item, in input order."""
        stored = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Large Eggs 12 Count",
            upc_variants=["012345"],
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(stored)
        session.commit()

        matcher = ProductMatcher(session)
        known_item = self._make_item("Large Eggs", upc="012345")
        unknown_item = self._make_item("Brand New Never Seen Product")
        outcomes = matcher.match_items([known_item, unknown_item])

        assert len(outcomes) == 2

        # First item hits the stored UPC.
        known_outcome = outcomes[0]
        assert known_outcome.match is not None
        assert known_outcome.confidence_level == MatchConfidence.HIGH
        assert known_outcome.created_new is False

        # Second item has no match and is created.
        unknown_outcome = outcomes[1]
        assert unknown_outcome.match is None
        assert unknown_outcome.created_new is True
|
||||
|
||||
|
||||
class TestMatchPurchaseItem:
    """Checks for the ``match_purchase_item`` convenience wrapper."""

    def test_convenience_function(self, session):
        """An item whose UPC is a stored variant matches with HIGH confidence."""
        stored = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Ground Beef 80/20",
            upc_variants=["999888"],
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(stored)
        session.commit()

        purchase = PurchaseItemCreate(
            product_name_raw="Ground Beef",
            upc="999888",
            unit_price=Decimal("5.99"),
            extended_price=Decimal("5.99"),
        )
        matched, level = match_purchase_item(session, purchase)

        assert matched is not None
        assert level == MatchConfidence.HIGH

    def test_auto_create_default(self, session):
        """With default arguments an unknown item still returns a LOW product."""
        purchase = PurchaseItemCreate(
            product_name_raw="Totally New Item",
            unit_price=Decimal("1.00"),
            extended_price=Decimal("1.00"),
        )
        created, level = match_purchase_item(session, purchase)

        assert created is not None
        assert level == MatchConfidence.LOW
|
||||
@@ -0,0 +1,158 @@
|
||||
"""Tests for product normalization module."""
|
||||
|
||||
import uuid
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from cartsnitch_common.models.product import NormalizedProduct
|
||||
|
||||
from receiptwitness.pipeline.normalization import (
|
||||
MatchMethod,
|
||||
clean_name,
|
||||
extract_size_info,
|
||||
jaccard_similarity,
|
||||
match_by_name,
|
||||
match_by_upc,
|
||||
normalize_product,
|
||||
)
|
||||
|
||||
|
||||
class TestCleanName:
    """Checks for ``clean_name``, the product-name cleaner used for matching."""

    def test_lowercase(self):
        """The cleaned name is lowercased; single spaces between words remain."""
        assert clean_name("Kroger WHOLE MILK") == "kroger whole milk"

    def test_removes_size_info(self):
        """A size token such as ``16 oz`` does not survive cleaning."""
        assert "oz" not in clean_name("Milk 16 oz Whole")

    def test_removes_noise_words(self):
        """The words ``the``, ``original`` and ``brand`` are dropped."""
        cleaned = clean_name("The Original Brand Milk")
        assert "the" not in cleaned.split()
        assert "original" not in cleaned.split()
        assert "brand" not in cleaned.split()

    def test_collapses_whitespace(self):
        """Runs of spaces in the input leave no double space in the output."""
        # The input must contain whitespace runs, and the check must look for
        # a double space: single spaces are kept (see test_lowercase), so
        # asserting that " " is absent could never pass.
        assert "  " not in clean_name("Milk   Whole   Gallon")

    def test_removes_punctuation(self):
        """Apostrophes and parentheses are removed from the cleaned name."""
        cleaned = clean_name("Meijer's Best (Organic) Milk!")
        assert "'" not in cleaned
        assert "(" not in cleaned
|
||||
|
||||
|
||||
class TestExtractSizeInfo:
    """Checks for ``extract_size_info``, which returns ``(amount, unit)`` or None."""

    def test_extracts_oz(self):
        """``18 oz`` is extracted as ("18", "oz")."""
        assert extract_size_info("Cereal 18 oz box") == ("18", "oz")

    def test_extracts_fl_oz(self):
        """``64 fl oz`` is extracted with the unit spelled ``fl_oz``."""
        assert extract_size_info("Juice 64 fl oz") == ("64", "fl_oz")

    def test_extracts_lb(self):
        """A decimal amount such as ``1.5 lb`` is kept as a string."""
        assert extract_size_info("Ground Beef 1.5 lb") == ("1.5", "lb")

    def test_extracts_ct(self):
        """``12 ct`` is extracted as ("12", "ct")."""
        assert extract_size_info("Eggs Large 12 ct") == ("12", "ct")

    def test_no_size_returns_none(self):
        """A name with no size token yields None."""
        size = extract_size_info("Bananas")
        assert size is None
|
||||
|
||||
|
||||
class TestJaccardSimilarity:
    """Checks for ``jaccard_similarity`` between two name strings."""

    def test_identical_strings(self):
        """Identical inputs score exactly 1.0."""
        name = "whole milk gallon"
        assert jaccard_similarity(name, "whole milk gallon") == 1.0

    def test_completely_different(self):
        """Inputs with no shared words score exactly 0.0."""
        assert jaccard_similarity("apple juice", "ground beef") == 0.0

    def test_partial_overlap(self):
        """Two of three words shared gives a score strictly between 0.4 and 0.8."""
        overlap = jaccard_similarity("kroger whole milk", "meijer whole milk")
        # "whole" and "milk" overlap
        assert 0.4 < overlap < 0.8

    def test_empty_strings(self):
        """An empty string on either side scores 0.0."""
        assert jaccard_similarity("", "") == 0.0
        assert jaccard_similarity("milk", "") == 0.0
|
||||
|
||||
|
||||
class TestMatchByUPC:
    """Checks for ``match_by_upc`` using the ``session`` fixture."""

    def test_match_found(self, session):
        """A UPC listed in a product's ``upc_variants`` matches with confidence 1.0."""
        product = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Whole Milk, Gallon",
            upc_variants=["0041250000001", "0041250000002"],
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(product)
        session.commit()
        # NOTE(review): an earlier comment here said SQLite does not support
        # JSONB containment and that this call "will raise", working only on
        # PostgreSQL. The assertions below expect a successful match, which
        # contradicts that note. Confirm which backend this test runs on and
        # whether match_by_upc has a SQLite-compatible path.
        result = match_by_upc(session, "0041250000001")
        assert result is not None
        assert result.method == MatchMethod.UPC
        assert result.confidence == 1.0

    def test_no_match(self, session):
        """A UPC that no product lists yields None."""
        result = match_by_upc(session, "9999999999999")
        assert result is None
|
||||
|
||||
|
||||
class TestMatchByName:
    """Checks for ``match_by_name`` using the ``session`` fixture."""

    def test_exact_name_match(self, session):
        """A name differing only by punctuation matches with confidence above 0.5."""
        stored = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Whole Milk, Gallon",
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(stored)
        session.commit()

        match_info = match_by_name(session, "Whole Milk Gallon")

        assert match_info is not None
        assert match_info.method == MatchMethod.NAME
        assert match_info.confidence > 0.5

    def test_fuzzy_match(self, session):
        """A differently branded but otherwise similar name matches at threshold 0.3."""
        stored = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Kroger Whole Milk, 1 Gallon",
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(stored)
        session.commit()

        match_info = match_by_name(session, "Meijer Whole Milk 1 Gallon", threshold=0.3)

        assert match_info is not None
        assert match_info.confidence > 0.3

    def test_no_match_below_threshold(self, session):
        """An unrelated name yields None at threshold 0.5."""
        stored = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Ground Beef 80/20",
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(stored)
        session.commit()

        match_info = match_by_name(session, "Apple Juice 64 oz", threshold=0.5)

        assert match_info is None
|
||||
|
||||
|
||||
class TestNormalizeProduct:
    """End-to-end checks for ``normalize_product`` when no UPC is supplied."""

    def test_name_fallback(self, session):
        """With ``upc=None`` the result is found by name and reports the NAME method."""
        eggs = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Large Eggs, 12 count",
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(eggs)
        session.commit()

        outcome = normalize_product(session, "Large Eggs 12 ct", upc=None)

        assert outcome is not None
        assert outcome.method == MatchMethod.NAME

    def test_no_match(self, session):
        """Nothing stored and no UPC given: the result is ``None``."""
        outcome = normalize_product(session, "Nonexistent Product XYZ", upc=None)
        assert outcome is None
|
||||
@@ -0,0 +1,204 @@
|
||||
"""Tests for receipt normalization pipeline."""
|
||||
|
||||
import uuid
|
||||
from datetime import date
|
||||
from decimal import Decimal
|
||||
|
||||
from receiptwitness.pipeline.receipt import (
|
||||
_clean_product_name,
|
||||
_safe_decimal,
|
||||
normalize_receipt,
|
||||
parse_meijer_item,
|
||||
)
|
||||
|
||||
|
||||
class TestCleanProductName:
    """Unit tests for ``_clean_product_name``."""

    def test_strips_whitespace(self):
        """Leading and trailing spaces are removed."""
        assert _clean_product_name(" Milk ") == "Milk"

    def test_removes_leading_punctuation(self):
        """Dashes wrapped around the name are removed on both sides."""
        assert _clean_product_name("---Milk---") == "Milk"

    def test_collapses_internal_whitespace(self):
        """Runs of spaces between words become a single space."""
        # The input must actually contain repeated spaces; with single spaces
        # the assertion would pass even if nothing were collapsed.
        assert _clean_product_name("Whole   Milk   Gallon") == "Whole Milk Gallon"

    def test_empty_string(self):
        """An empty name stays empty."""
        assert _clean_product_name("") == ""
|
||||
|
||||
|
||||
class TestSafeDecimal:
    """Unit tests for ``_safe_decimal`` across the input types the tests feed it."""

    def test_string_input(self):
        """A numeric string converts to the equal ``Decimal``."""
        converted = _safe_decimal("3.99")
        assert converted == Decimal("3.99")

    def test_float_input(self):
        """The float 3.99 is expected to compare equal to ``Decimal("3.99")``."""
        converted = _safe_decimal(3.99)
        assert converted == Decimal("3.99")

    def test_int_input(self):
        """An int converts to the equal ``Decimal``."""
        converted = _safe_decimal(4)
        assert converted == Decimal("4")

    def test_none_returns_default(self):
        """``None`` with no explicit default gives zero."""
        converted = _safe_decimal(None)
        assert converted == Decimal("0")

    def test_none_custom_default(self):
        """``None`` with an explicit default gives that default back."""
        fallback = Decimal("1")
        assert _safe_decimal(None, fallback) == Decimal("1")

    def test_invalid_returns_default(self):
        """Unparseable text gives zero."""
        converted = _safe_decimal("not-a-number")
        assert converted == Decimal("0")

    def test_decimal_passthrough(self):
        """A ``Decimal`` input comes back with an equal value."""
        amount = Decimal("5.50")
        assert _safe_decimal(amount) == Decimal("5.50")
|
||||
|
||||
|
||||
class TestParseMeijerItem:
    """Unit tests for ``parse_meijer_item`` covering primary and alternate field names."""

    @staticmethod
    def _check(item, expected):
        """Assert that every named attribute of ``item`` equals its expected value."""
        for attribute, wanted in expected.items():
            assert getattr(item, attribute) == wanted

    def test_basic_item(self):
        """The primary field names populate every attribute of the parsed item."""
        parsed = parse_meijer_item(
            {
                "description": "Kroger Whole Milk 1 Gallon",
                "upc": "0041250000001",
                "quantity": 1,
                "unitPrice": "3.99",
                "extendedPrice": "3.99",
                "category": "DAIRY",
            }
        )
        self._check(
            parsed,
            {
                "product_name_raw": "Kroger Whole Milk 1 Gallon",
                "upc": "41250000001",  # leading zeros stripped
                "quantity": Decimal("1"),
                "unit_price": Decimal("3.99"),
                "extended_price": Decimal("3.99"),
                "category_raw": "DAIRY",
            },
        )

    def test_alternate_field_names(self):
        """``name``/``upcCode``/``qty``/``price``/``totalPrice``/``department`` are accepted."""
        parsed = parse_meijer_item(
            {
                "name": "Eggs Large 12 ct",
                "upcCode": "012345",
                "qty": 2,
                "price": "4.50",
                "totalPrice": "9.00",
                "department": "EGGS",
            }
        )
        self._check(
            parsed,
            {
                "product_name_raw": "Eggs Large 12 ct",
                "upc": "12345",
                "quantity": Decimal("2"),
                "unit_price": Decimal("4.50"),
                "extended_price": Decimal("9.00"),
                "category_raw": "EGGS",
            },
        )

    def test_calculates_extended_from_unit_price(self):
        """With no extended price given, 3 units at 0.59 are expected to total 1.77."""
        bananas = {"description": "Bananas", "unitPrice": "0.59", "quantity": 3}
        assert parse_meijer_item(bananas).extended_price == Decimal("1.77")

    def test_discounts_parsed(self):
        """Regular/sale prices and ``couponAmount``/``loyaltyAmount`` reach the item."""
        parsed = parse_meijer_item(
            {
                "description": "Cereal",
                "unitPrice": "4.99",
                "extendedPrice": "4.99",
                "regularPrice": "5.99",
                "salePrice": "4.99",
                "couponAmount": "1.00",
                "loyaltyAmount": "0.50",
            }
        )
        self._check(
            parsed,
            {
                "regular_price": Decimal("5.99"),
                "sale_price": Decimal("4.99"),
                "coupon_discount": Decimal("1.00"),
                "loyalty_discount": Decimal("0.50"),
            },
        )

    def test_alternate_discount_names(self):
        """``couponDiscount`` and ``loyaltyDiscount`` are accepted as discount fields."""
        parsed = parse_meijer_item(
            {
                "description": "Bread",
                "unitPrice": "2.99",
                "extendedPrice": "2.99",
                "couponDiscount": "0.75",
                "loyaltyDiscount": "0.25",
            }
        )
        self._check(
            parsed,
            {"coupon_discount": Decimal("0.75"), "loyalty_discount": Decimal("0.25")},
        )

    def test_missing_fields_default_gracefully(self):
        """A description-only item gets quantity 1, price 0 and ``None`` for the rest."""
        parsed = parse_meijer_item({"description": "Mystery Item"})

        self._check(
            parsed,
            {
                "product_name_raw": "Mystery Item",
                "quantity": Decimal("1"),
                "unit_price": Decimal("0"),
            },
        )
        for optional in ("upc", "regular_price", "category_raw"):
            assert getattr(parsed, optional) is None

    def test_no_upc_returns_none(self):
        """An item without any UPC field has ``upc`` set to ``None``."""
        loose = {"description": "Loose Bananas", "unitPrice": "1.00", "extendedPrice": "1.00"}
        assert parse_meijer_item(loose).upc is None
|
||||
|
||||
|
||||
class TestNormalizeReceipt:
    """Unit tests for ``normalize_receipt``: field aliases, defaults and date handling."""

    @staticmethod
    def _new_ids():
        """Return a fresh ``(user_id, store_id)`` pair of UUID strings."""
        return str(uuid.uuid4()), str(uuid.uuid4())

    def test_full_receipt(self):
        """Every header field and both items of a complete receipt are carried over."""
        user_id, store_id = self._new_ids()
        raw = {
            "receiptId": "REC-001",
            "date": "2026-03-15",
            "total": "25.47",
            "subtotal": "23.00",
            "tax": "2.47",
            "savings": "3.00",
            "items": [
                {"description": "Milk", "unitPrice": "3.99", "extendedPrice": "3.99"},
                {"description": "Bread", "unitPrice": "2.50", "extendedPrice": "2.50"},
            ],
        }

        purchase = normalize_receipt(raw, user_id, store_id)

        assert purchase.receipt_id == "REC-001"
        assert purchase.purchase_date == date(2026, 3, 15)
        assert purchase.total == Decimal("25.47")
        assert purchase.subtotal == Decimal("23.00")
        assert purchase.tax == Decimal("2.47")
        assert purchase.savings_total == Decimal("3.00")
        assert len(purchase.items) == 2
        assert purchase.items[0].product_name_raw == "Milk"
        # The raw payload is expected to be kept on the purchase.
        assert purchase.raw_data == raw

    def test_alternate_receipt_fields(self):
        """``receipt_id``/``purchaseDate``/``totalAmount``/``taxAmount``/``totalSavings`` work."""
        user_id, store_id = self._new_ids()
        raw = {
            "receipt_id": "REC-002",
            "purchaseDate": "2026-03-14",
            "totalAmount": "10.00",
            "taxAmount": "0.75",
            "totalSavings": "1.50",
            "items": [],
        }

        purchase = normalize_receipt(raw, user_id, store_id)

        assert purchase.receipt_id == "REC-002"
        assert purchase.purchase_date == date(2026, 3, 14)
        assert purchase.total == Decimal("10.00")
        assert purchase.tax == Decimal("0.75")
        assert purchase.savings_total == Decimal("1.50")

    def test_missing_date_defaults_to_today(self):
        """A receipt without a date gets the current date."""
        user_id, store_id = self._new_ids()
        raw = {"total": "5.00", "items": []}

        # Bracket the call: comparing against a single later date.today()
        # fails spuriously when the test runs across midnight.
        earliest = date.today()
        purchase = normalize_receipt(raw, user_id, store_id)
        latest = date.today()

        assert earliest <= purchase.purchase_date <= latest

    def test_generates_receipt_id_if_missing(self):
        """A receipt without an id still ends up with a non-empty ``receipt_id``."""
        user_id, store_id = self._new_ids()
        raw = {"total": "5.00", "date": "2026-03-15", "items": []}

        purchase = normalize_receipt(raw, user_id, store_id)

        # Only truthiness is checked; presumably a generated UUID string.
        assert purchase.receipt_id

    def test_date_object_passthrough(self):
        """A ``date`` instance supplied as the date is used unchanged."""
        user_id, store_id = self._new_ids()
        raw = {"date": date(2026, 1, 1), "total": "5.00", "items": []}

        purchase = normalize_receipt(raw, user_id, store_id)

        assert purchase.purchase_date == date(2026, 1, 1)
|
||||
@@ -0,0 +1,435 @@
|
||||
"""Regression tests: graceful handling of page layout changes.
|
||||
|
||||
Retailers frequently change their API response structures, field names,
|
||||
and nesting. These tests verify that both parsers degrade gracefully when
|
||||
encountering alternative or missing fields — producing valid output
|
||||
instead of crashing.
|
||||
"""
|
||||
|
||||
from decimal import Decimal
|
||||
|
||||
from receiptwitness.parsers.kroger import parse_kroger_receipt
|
||||
from receiptwitness.parsers.meijer import parse_meijer_receipt
|
||||
from receiptwitness.scrapers.base import RawReceipt
|
||||
|
||||
|
||||
class TestKrogerFieldNameVariations:
    """Alternate Kroger field names must parse to the same normalized output keys."""

    PURCHASE_DATE = "2026-03-12"

    @classmethod
    def _parse(cls, receipt_id, raw_data):
        """Wrap ``raw_data`` in a ``RawReceipt`` and run it through the Kroger parser."""
        receipt = RawReceipt(
            receipt_id=receipt_id,
            purchase_date=cls.PURCHASE_DATE,
            raw_data=raw_data,
        )
        return parse_kroger_receipt(receipt)

    @staticmethod
    def _detail(items, total, key="items"):
        """Build a payload whose ``detail`` holds ``items`` under ``key`` plus a total."""
        return {"detail": {key: items, "total": total}}

    def test_alternative_item_key_line_items(self):
        """Items listed under ``lineItems`` are picked up."""
        milk = {"description": "MILK", "basePrice": 3.99, "totalPrice": 3.99}
        parsed = self._parse("KR-ALT-1", self._detail([milk], 3.99, key="lineItems"))
        assert len(parsed["items"]) == 1
        assert parsed["items"][0]["product_name_raw"] == "MILK"

    def test_alternative_item_key_receipt_items(self):
        """Items listed under ``receiptItems`` are picked up."""
        eggs = {"description": "EGGS", "basePrice": 5.49, "totalPrice": 5.49}
        parsed = self._parse("KR-ALT-2", self._detail([eggs], 5.49, key="receiptItems"))
        assert len(parsed["items"]) == 1
        assert parsed["items"][0]["product_name_raw"] == "EGGS"

    def test_alternative_description_fields(self):
        """``productName``, ``itemDescription`` and ``name`` each supply the product name."""
        for name_key in ("productName", "itemDescription", "name"):
            entry = {name_key: "TEST PRODUCT", "basePrice": 1.00, "totalPrice": 1.00}
            parsed = self._parse("KR-DESC", self._detail([entry], 1.00))
            assert parsed["items"][0]["product_name_raw"] == "TEST PRODUCT"

    def test_alternative_price_fields(self):
        """``unitPrice`` and ``price`` each stand in for ``basePrice``."""
        scenarios = (
            ("KR-PRICE-1", "ITEM A", "unitPrice", 2.50, Decimal("2.50")),
            ("KR-PRICE-2", "ITEM B", "price", 4.00, Decimal("4.00")),
        )
        for receipt_id, label, price_key, amount, wanted in scenarios:
            entry = {"description": label, price_key: amount, "totalPrice": amount}
            parsed = self._parse(receipt_id, self._detail([entry], amount))
            assert parsed["items"][0]["unit_price"] == wanted

    def test_alternative_total_fields(self):
        """Top-level ``orderTotal`` and ``grandTotal`` each supply the receipt total."""
        for total_key in ("orderTotal", "grandTotal"):
            parsed = self._parse("KR-TOT", {total_key: 42.50, "detail": {}})
            assert parsed["total"] == Decimal("42.50")

    def test_alternative_savings_fields(self):
        """Top-level ``youSaved`` supplies the savings total.

        NOTE(review): the earlier docstring also named ``totalDiscount``, but
        only ``youSaved`` is exercised here.
        """
        parsed = self._parse("KR-SAV-1", {"youSaved": 5.00, "detail": {}})
        assert parsed["savings_total"] == Decimal("5.00")

    def test_alternative_tax_field(self):
        """Top-level ``salesTax`` supplies the tax amount."""
        parsed = self._parse("KR-TAX", {"salesTax": 3.25, "detail": {}})
        assert parsed["tax"] == Decimal("3.25")

    def test_alternative_quantity_field_qty(self):
        """``qty`` supplies the item quantity."""
        apples = {"description": "APPLES", "qty": 5, "basePrice": 1.00, "totalPrice": 5.00}
        parsed = self._parse("KR-QTY", self._detail([apples], 5.00))
        assert parsed["items"][0]["quantity"] == Decimal("5")

    def test_alternative_upc_field_kroger_product_id(self):
        """``krogerProductId`` supplies the item UPC."""
        entry = {
            "description": "ITEM",
            "krogerProductId": "12345678",
            "basePrice": 1.00,
            "totalPrice": 1.00,
        }
        parsed = self._parse("KR-UPC", self._detail([entry], 1.00))
        assert parsed["items"][0]["upc"] == "12345678"

    def test_missing_extended_price_computed(self):
        """Without ``totalPrice`` the extended price equals unit price times quantity."""
        eggs = {"description": "EGGS", "basePrice": 5.49, "quantity": 2}
        parsed = self._parse("KR-CALC", self._detail([eggs], 10.98))
        wanted = Decimal("5.49") * Decimal("2")
        assert parsed["items"][0]["extended_price"] == wanted
|
||||
|
||||
|
||||
class TestMeijerFieldNameVariations:
    """Alternate Meijer field names must parse to the same normalized output keys."""

    PURCHASE_DATE = "2026-03-10"

    @classmethod
    def _parse(cls, receipt_id, raw_data):
        """Wrap ``raw_data`` in a ``RawReceipt`` and run it through the Meijer parser."""
        receipt = RawReceipt(
            receipt_id=receipt_id,
            purchase_date=cls.PURCHASE_DATE,
            raw_data=raw_data,
        )
        return parse_meijer_receipt(receipt)

    @staticmethod
    def _detail(items, total, key="items"):
        """Build a payload whose ``detail`` holds ``items`` under ``key`` plus a total."""
        return {"detail": {key: items, "total": total}}

    def test_alternative_item_key_line_items(self):
        """Items listed under ``lineItems`` are picked up."""
        bananas = {"description": "BANANAS", "price": 0.69, "extendedPrice": 0.69}
        parsed = self._parse("MJ-ALT-1", self._detail([bananas], 0.69, key="lineItems"))
        assert len(parsed["items"]) == 1
        assert parsed["items"][0]["product_name_raw"] == "BANANAS"

    def test_alternative_description_fields(self):
        """``itemDescription`` and ``name`` each supply the product name."""
        for name_key in ("itemDescription", "name"):
            entry = {name_key: "TEST ITEM", "price": 1.00, "extendedPrice": 1.00}
            parsed = self._parse("MJ-DESC", self._detail([entry], 1.00))
            assert parsed["items"][0]["product_name_raw"] == "TEST ITEM"

    def test_alternative_price_field_unit_price(self):
        """``unitPrice`` supplies the unit price."""
        milk = {"description": "MILK", "unitPrice": 3.49, "totalPrice": 3.49}
        parsed = self._parse("MJ-PRICE", self._detail([milk], 3.49))
        assert parsed["items"][0]["unit_price"] == Decimal("3.49")

    def test_alternative_extended_price_field_total_price(self):
        """``totalPrice`` supplies the extended price."""
        cereal = {"description": "CEREAL", "price": 4.99, "totalPrice": 4.99}
        parsed = self._parse("MJ-EXT", self._detail([cereal], 4.99))
        assert parsed["items"][0]["extended_price"] == Decimal("4.99")

    def test_alternative_total_field_transaction_total(self):
        """Top-level ``transactionTotal`` supplies the receipt total."""
        parsed = self._parse("MJ-TOT", {"transactionTotal": 55.00, "detail": {}})
        assert parsed["total"] == Decimal("55.00")

    def test_alternative_loyalty_field(self):
        """``loyaltyDiscount`` supplies the item loyalty discount."""
        entry = {
            "description": "ITEM",
            "price": 5.00,
            "extendedPrice": 5.00,
            "loyaltyDiscount": 0.50,
        }
        parsed = self._parse("MJ-LOY", self._detail([entry], 5.00))
        assert parsed["items"][0]["loyalty_discount"] == Decimal("0.50")

    def test_alternative_upc_field_uppercase(self):
        """Upper-case ``UPC`` supplies the UPC, with leading zeros removed."""
        entry = {
            "description": "ITEM",
            "UPC": "0012345678",
            "price": 1.00,
            "extendedPrice": 1.00,
        }
        parsed = self._parse("MJ-UPC", self._detail([entry], 1.00))
        assert parsed["items"][0]["upc"] == "12345678"

    def test_alternative_category_field(self):
        """``departmentDescription`` supplies the raw category."""
        entry = {
            "description": "ITEM",
            "price": 1.00,
            "extendedPrice": 1.00,
            "departmentDescription": "FROZEN",
        }
        parsed = self._parse("MJ-CAT", self._detail([entry], 1.00))
        assert parsed["items"][0]["category_raw"] == "FROZEN"

    def test_missing_extended_price_computed(self):
        """Without an extended price field it equals unit price times quantity."""
        milk = {"description": "MILK", "price": 3.49, "quantity": 2}
        parsed = self._parse("MJ-CALC", self._detail([milk], 6.98))
        wanted = Decimal("3.49") * Decimal("2")
        assert parsed["items"][0]["extended_price"] == wanted

    def test_missing_description_fallback(self):
        """An item with no name field at all is labelled ``UNKNOWN ITEM``."""
        nameless = {"price": 1.00, "extendedPrice": 1.00}
        parsed = self._parse("MJ-NODESC", self._detail([nameless], 1.00))
        assert parsed["items"][0]["product_name_raw"] == "UNKNOWN ITEM"
|
||||
|
||||
|
||||
class TestMixedFieldVersions:
    """Receipts mixing naming conventions, unknown shapes and null values must still parse."""

    KROGER_DATE = "2026-03-12"
    MEIJER_DATE = "2026-03-10"
    UNRECOGNIZED = {"something_unexpected": [1, 2, 3], "detail": {"foo": "bar"}}

    def test_kroger_mixed_item_fields(self):
        """One receipt may hold an old-style and a new-style item side by side."""
        line_items = [
            {"description": "OLD STYLE", "basePrice": 2.00, "totalPrice": 2.00},
            {"productName": "NEW STYLE", "unitPrice": 3.00, "extendedAmount": 3.00},
        ]
        receipt = RawReceipt(
            receipt_id="KR-MIX",
            purchase_date=self.KROGER_DATE,
            raw_data={"detail": {"items": line_items, "total": 5.00}},
        )

        parsed_items = parse_kroger_receipt(receipt)["items"]

        assert len(parsed_items) == 2
        first, second = parsed_items[0], parsed_items[1]
        assert first["product_name_raw"] == "OLD STYLE"
        assert first["unit_price"] == Decimal("2.00")
        assert second["product_name_raw"] == "NEW STYLE"
        assert second["unit_price"] == Decimal("3.00")

    def test_kroger_completely_unknown_structure_no_crash(self):
        """An unrecognized Kroger payload keeps its id and yields no items."""
        receipt = RawReceipt(
            receipt_id="KR-UNKNOWN",
            purchase_date=self.KROGER_DATE,
            raw_data={"something_unexpected": [1, 2, 3], "detail": {"foo": "bar"}},
        )
        parsed = parse_kroger_receipt(receipt)
        assert parsed["receipt_id"] == "KR-UNKNOWN"
        assert parsed["items"] == []

    def test_meijer_completely_unknown_structure_no_crash(self):
        """An unrecognized Meijer payload keeps its id and yields no items."""
        receipt = RawReceipt(
            receipt_id="MJ-UNKNOWN",
            purchase_date=self.MEIJER_DATE,
            raw_data={"something_unexpected": [1, 2, 3], "detail": {"foo": "bar"}},
        )
        parsed = parse_meijer_receipt(receipt)
        assert parsed["receipt_id"] == "MJ-UNKNOWN"
        assert parsed["items"] == []

    def test_kroger_null_fields_no_crash(self):
        """``None`` in Kroger price, quantity and total fields falls back to a zero price."""
        null_item = {
            "description": "ITEM",
            "basePrice": None,
            "totalPrice": None,
            "quantity": None,
            "upc": None,
            "department": None,
        }
        receipt = RawReceipt(
            receipt_id="KR-NULL",
            purchase_date=self.KROGER_DATE,
            raw_data={
                "detail": {
                    "items": [null_item],
                    "total": None,
                    "subtotal": None,
                    "tax": None,
                }
            },
        )

        parsed_item = parse_kroger_receipt(receipt)["items"][0]

        assert parsed_item["product_name_raw"] == "ITEM"
        assert parsed_item["unit_price"] == Decimal("0")

    def test_meijer_null_fields_no_crash(self):
        """``None`` in Meijer price, quantity and total fields falls back to a zero price."""
        null_item = {
            "description": "ITEM",
            "price": None,
            "extendedPrice": None,
            "quantity": None,
            "upc": None,
            "category": None,
        }
        receipt = RawReceipt(
            receipt_id="MJ-NULL",
            purchase_date=self.MEIJER_DATE,
            raw_data={"detail": {"items": [null_item], "total": None}},
        )

        parsed_item = parse_meijer_receipt(receipt)["items"][0]

        assert parsed_item["product_name_raw"] == "ITEM"
        assert parsed_item["unit_price"] == Decimal("0")
|
||||
@@ -0,0 +1,365 @@
|
||||
"""Regression tests: rate limiting and retry behavior.
|
||||
|
||||
Validates that scrapers enforce human-like delays between requests
|
||||
and handle rate-limit/error responses gracefully without infinite retries.
|
||||
"""
|
||||
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from receiptwitness.scrapers.base import SessionData
|
||||
from receiptwitness.scrapers.kroger import DEFAULT_USER_AGENT, KrogerScraper
|
||||
from receiptwitness.scrapers.meijer import MeijerScraper
|
||||
|
||||
|
||||
class TestHumanDelayBehavior:
    """``human_delay`` must sleep for a duration inside the configured bounds."""

    # Patch target for the sleep call made by the scraper base module.
    SLEEP_TARGET = "receiptwitness.scrapers.base.asyncio.sleep"

    @pytest.mark.asyncio
    async def test_delay_within_bounds(self):
        """Explicit bounds of 100-200 ms give one sleep of 0.1-0.2 seconds."""
        kroger = KrogerScraper()
        with patch(self.SLEEP_TARGET, new_callable=AsyncMock) as fake_sleep:
            await kroger.human_delay(100, 200)
            fake_sleep.assert_called_once()
            slept_for = fake_sleep.call_args[0][0]
            assert 0.1 <= slept_for <= 0.2

    @pytest.mark.asyncio
    async def test_delay_uses_settings_defaults(self):
        """With no arguments the bounds come from the patched settings object."""
        meijer = MeijerScraper()
        with (
            patch("receiptwitness.scrapers.base.settings") as fake_settings,
            patch(self.SLEEP_TARGET, new_callable=AsyncMock) as fake_sleep,
        ):
            fake_settings.min_request_delay_ms = 1000
            fake_settings.max_request_delay_ms = 5000
            await meijer.human_delay()
            fake_sleep.assert_called_once()
            slept_for = fake_sleep.call_args[0][0]
            assert 1.0 <= slept_for <= 5.0

    @pytest.mark.asyncio
    async def test_delay_is_randomized(self):
        """Twenty calls over a 100-5000 ms range give at least two distinct delays."""
        kroger = KrogerScraper()
        observed = []
        with patch(self.SLEEP_TARGET, new_callable=AsyncMock) as fake_sleep:
            attempts = 0
            while attempts < 20:
                await kroger.human_delay(100, 5000)
                # Record the argument of the most recent sleep call.
                observed.append(fake_sleep.call_args[0][0])
                attempts += 1
        # Probabilistic check: identical delays 20 times in a row are not expected.
        assert len(set(observed)) >= 2
|
||||
|
||||
|
||||
class TestKrogerRateLimiting:
    """The Kroger scraper must pause via ``human_delay`` while fetching receipts."""

    @staticmethod
    def _ok_response(payload):
        """Return a mocked successful response whose ``json()`` resolves to ``payload``."""
        response = AsyncMock()
        response.ok = True
        response.json = AsyncMock(return_value=payload)
        return response

    @staticmethod
    def _playwright_manager(page):
        """Return an async context manager mock yielding a Playwright stub that serves ``page``."""
        context = AsyncMock()
        context.new_page = AsyncMock(return_value=page)
        context.add_cookies = AsyncMock()
        context.add_init_script = AsyncMock()

        browser = AsyncMock()
        browser.new_context = AsyncMock(return_value=context)
        context.browser = browser

        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        manager = AsyncMock()
        manager.__aenter__ = AsyncMock(return_value=playwright)
        manager.__aexit__ = AsyncMock(return_value=False)
        return manager

    @pytest.mark.asyncio
    async def test_delay_called_between_receipts(self):
        """Three listed orders give three receipts and at least three ``human_delay`` calls."""
        scraper = KrogerScraper()
        valid_session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".kroger.com", "path": "/"}],
            user_agent=DEFAULT_USER_AGENT,
            created_at=datetime.now(UTC),
            expires_at=datetime.now(UTC) + timedelta(hours=2),
        )

        orders = []
        for i in range(3):
            orders.append(
                {
                    "orderId": f"KR-{i}",
                    "purchaseDate": "2026-03-10T14:00:00Z",
                    "storeNumber": "357",
                }
            )
        listing = self._ok_response({"orders": orders})
        detail = self._ok_response({})

        # First GET returns the order listing; the next three return the
        # same (empty) detail response object.
        request = AsyncMock()
        request.get = AsyncMock(side_effect=[listing] + [detail] * 3)

        page = AsyncMock()
        page.goto = AsyncMock()
        page.request = request

        with (
            patch("receiptwitness.scrapers.kroger.async_playwright") as fake_async_playwright,
            patch.object(scraper, "human_delay", new_callable=AsyncMock) as delay_spy,
        ):
            fake_async_playwright.return_value = self._playwright_manager(page)

            receipts = await scraper.scrape_receipts(valid_session)

        assert len(receipts) == 3
        # One delay per receipt is the minimum; any extra delay (for example
        # around the initial navigation) is allowed by the >= comparison.
        assert delay_spy.call_count >= 3
|
||||
|
||||
|
||||
class TestMeijerRateLimiting:
    """The Meijer scraper must pause via ``human_delay`` while fetching receipts."""

    @staticmethod
    def _ok_response(payload):
        """Return a mocked successful response whose ``json()`` resolves to ``payload``."""
        response = AsyncMock()
        response.ok = True
        response.json = AsyncMock(return_value=payload)
        return response

    @staticmethod
    def _playwright_manager(page):
        """Return an async context manager mock yielding a Playwright stub that serves ``page``."""
        context = AsyncMock()
        context.new_page = AsyncMock(return_value=page)
        context.add_cookies = AsyncMock()
        context.add_init_script = AsyncMock()

        browser = AsyncMock()
        browser.new_context = AsyncMock(return_value=context)
        context.browser = browser

        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        manager = AsyncMock()
        manager.__aenter__ = AsyncMock(return_value=playwright)
        manager.__aexit__ = AsyncMock(return_value=False)
        return manager

    @pytest.mark.asyncio
    async def test_delay_called_between_receipts(self):
        """Three listed transactions give three receipts and at least three delays."""
        scraper = MeijerScraper()
        valid_session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".meijer.com", "path": "/"}],
            user_agent="test",
            created_at=datetime.now(UTC),
            expires_at=datetime.now(UTC) + timedelta(hours=4),
        )

        transactions = []
        for i in range(3):
            transactions.append(
                {
                    "transactionId": f"TXN-{i}",
                    "transactionDate": "2026-03-10T14:00:00Z",
                    "storeNumber": "42",
                }
            )
        listing = self._ok_response({"transactions": transactions})
        detail = self._ok_response({})

        # First GET returns the transaction listing; the next three return
        # the same (empty) detail response object.
        request = AsyncMock()
        request.get = AsyncMock(side_effect=[listing] + [detail] * 3)

        page = AsyncMock()
        page.goto = AsyncMock()
        page.request = request

        with (
            patch("receiptwitness.scrapers.meijer.async_playwright") as fake_async_playwright,
            patch.object(scraper, "human_delay", new_callable=AsyncMock) as delay_spy,
        ):
            fake_async_playwright.return_value = self._playwright_manager(page)

            receipts = await scraper.scrape_receipts(valid_session)

        assert len(receipts) == 3
        assert delay_spy.call_count >= 3
|
||||
|
||||
|
||||
class TestGracefulErrorRecovery:
|
||||
"""Scrapers should not retry endlessly on errors."""
|
||||
|
||||
@pytest.mark.asyncio
async def test_kroger_api_500_returns_empty_not_retry(self):
    """A 500 from the listing call gives an empty result after exactly one GET."""
    scraper = KrogerScraper()
    valid_session = SessionData(
        cookies=[{"name": "s", "value": "v", "domain": ".kroger.com", "path": "/"}],
        user_agent=DEFAULT_USER_AGENT,
        created_at=datetime.now(UTC),
        expires_at=datetime.now(UTC) + timedelta(hours=2),
    )

    # Every GET answers with the same failing response.
    server_error = AsyncMock()
    server_error.ok = False
    server_error.status = 500
    server_error.status_text = "Internal Server Error"

    request = AsyncMock()
    request.get = AsyncMock(return_value=server_error)

    page = AsyncMock()
    page.goto = AsyncMock()
    page.request = request

    context = AsyncMock()
    context.new_page = AsyncMock(return_value=page)
    context.add_cookies = AsyncMock()
    context.add_init_script = AsyncMock()

    browser = AsyncMock()
    browser.new_context = AsyncMock(return_value=context)
    context.browser = browser

    playwright = AsyncMock()
    playwright.chromium.launch = AsyncMock(return_value=browser)

    manager = AsyncMock()
    manager.__aenter__ = AsyncMock(return_value=playwright)
    manager.__aexit__ = AsyncMock(return_value=False)

    with (
        patch("receiptwitness.scrapers.kroger.async_playwright") as fake_async_playwright,
        patch.object(scraper, "human_delay", new_callable=AsyncMock),
    ):
        fake_async_playwright.return_value = manager

        receipts = await scraper.scrape_receipts(valid_session)

    assert receipts == []
    # A single GET means the scraper did not retry the failed call.
    assert request.get.call_count == 1
|
||||
|
||||
@pytest.mark.asyncio
async def test_kroger_429_returns_empty_not_retry(self):
    """A 429 from the listing call gives an empty result after exactly one GET."""
    scraper = KrogerScraper()
    valid_session = SessionData(
        cookies=[{"name": "s", "value": "v", "domain": ".kroger.com", "path": "/"}],
        user_agent=DEFAULT_USER_AGENT,
        created_at=datetime.now(UTC),
        expires_at=datetime.now(UTC) + timedelta(hours=2),
    )

    # Every GET answers with the same rate-limit response.
    rate_limited = AsyncMock()
    rate_limited.ok = False
    rate_limited.status = 429
    rate_limited.status_text = "Too Many Requests"

    request = AsyncMock()
    request.get = AsyncMock(return_value=rate_limited)

    page = AsyncMock()
    page.goto = AsyncMock()
    page.request = request

    context = AsyncMock()
    context.new_page = AsyncMock(return_value=page)
    context.add_cookies = AsyncMock()
    context.add_init_script = AsyncMock()

    browser = AsyncMock()
    browser.new_context = AsyncMock(return_value=context)
    context.browser = browser

    playwright = AsyncMock()
    playwright.chromium.launch = AsyncMock(return_value=browser)

    manager = AsyncMock()
    manager.__aenter__ = AsyncMock(return_value=playwright)
    manager.__aexit__ = AsyncMock(return_value=False)

    with (
        patch("receiptwitness.scrapers.kroger.async_playwright") as fake_async_playwright,
        patch.object(scraper, "human_delay", new_callable=AsyncMock),
    ):
        fake_async_playwright.return_value = manager

        receipts = await scraper.scrape_receipts(valid_session)

    assert receipts == []
    # A single GET means the scraper did not retry after being rate limited.
    assert request.get.call_count == 1
|
||||
|
||||
@pytest.mark.asyncio
async def test_meijer_detail_exception_continues(self):
    """A failed detail fetch for one receipt must not abort the remaining receipts."""
    meijer = MeijerScraper()
    session = SessionData(
        cookies=[{"name": "s", "value": "v", "domain": ".meijer.com", "path": "/"}],
        user_agent="test",
        created_at=datetime.now(UTC),
        expires_at=datetime.now(UTC) + timedelta(hours=4),
    )

    # Listing response with two transactions.
    transactions = [
        {
            "transactionId": "TXN-1",
            "transactionDate": "2026-03-10T14:00:00Z",
            "storeNumber": "42",
        },
        {
            "transactionId": "TXN-2",
            "transactionDate": "2026-03-11T10:00:00Z",
            "storeNumber": "42",
        },
    ]
    listing = AsyncMock(ok=True, json=AsyncMock(return_value={"transactions": transactions}))

    # Detail responses: the first is a non-OK 500, the second succeeds.
    detail_failed = AsyncMock(ok=False, status=500)
    detail_ok = AsyncMock(ok=True, json=AsyncMock(return_value={"items": []}))

    api = AsyncMock(get=AsyncMock(side_effect=[listing, detail_failed, detail_ok]))

    # Mocked Playwright object graph: playwright -> browser -> context -> page.
    page = AsyncMock(goto=AsyncMock(), request=api)
    context = AsyncMock(
        new_page=AsyncMock(return_value=page),
        add_cookies=AsyncMock(),
        add_init_script=AsyncMock(),
    )
    browser = AsyncMock(new_context=AsyncMock(return_value=context))
    context.browser = browser
    playwright = AsyncMock()
    playwright.chromium.launch = AsyncMock(return_value=browser)

    manager = AsyncMock()
    manager.__aenter__ = AsyncMock(return_value=playwright)
    manager.__aexit__ = AsyncMock(return_value=False)

    with (
        patch("receiptwitness.scrapers.meijer.async_playwright") as fake_apw,
        patch.object(meijer, "human_delay", new_callable=AsyncMock),
    ):
        fake_apw.return_value = manager
        receipts = await meijer.scrape_receipts(session)

    # Both receipts come back; the first one carries an empty detail payload.
    assert len(receipts) == 2
    assert receipts[0].raw_data.get("detail") == {}
    assert receipts[1].receipt_id == "TXN-2"
|
||||
@@ -0,0 +1,364 @@
|
||||
"""Regression tests: scraper output matches expected schema.
|
||||
|
||||
Validates that parsed receipts from both Kroger and Meijer conform to the
|
||||
PurchaseCreate schema contract. Uses recorded fixtures to ensure outputs
|
||||
remain stable across code changes.
|
||||
"""
|
||||
|
||||
from decimal import Decimal
|
||||
|
||||
from receiptwitness.parsers.kroger import parse_kroger_receipt
|
||||
from receiptwitness.parsers.meijer import parse_meijer_receipt
|
||||
from receiptwitness.scrapers.base import RawReceipt
|
||||
|
||||
# Top-level keys every parsed receipt must carry.
RECEIPT_REQUIRED_KEYS = {
    "items",
    "purchase_date",
    "raw_data",
    "receipt_id",
    "total",
}
# Top-level keys a parsed receipt may carry in addition to the required ones.
RECEIPT_OPTIONAL_KEYS = {
    "savings_total",
    "source_url",
    "subtotal",
    "tax",
}

# Keys every parsed line item must carry.
ITEM_REQUIRED_KEYS = {"extended_price", "product_name_raw", "quantity", "unit_price", "upc"}
# Keys a parsed line item may carry in addition to the required ones.
ITEM_OPTIONAL_KEYS = {
    "category_raw",
    "coupon_discount",
    "loyalty_discount",
    "regular_price",
    "sale_price",
}
|
||||
|
||||
|
||||
def _validate_receipt_schema(result: dict) -> None:
    """Check a parsed receipt dict against the receipt schema.

    Asserts that every key in RECEIPT_REQUIRED_KEYS is present, that required
    and optional fields have the expected types, and that no key outside
    RECEIPT_REQUIRED_KEYS | RECEIPT_OPTIONAL_KEYS appears.

    Raises:
        AssertionError: on the first violation found.
    """
    # Presence of required keys.
    for required in RECEIPT_REQUIRED_KEYS:
        assert required in result, f"Missing required key: {required}"

    # Types of the required fields, checked in a fixed order.
    required_types = (
        ("receipt_id", str),
        ("purchase_date", str),
        ("total", Decimal),
        ("items", list),
        ("raw_data", dict),
    )
    for field, expected in required_types:
        assert isinstance(result[field], expected)

    # Optional fields are only type-checked when present and not None.
    optional_types = (
        ("subtotal", Decimal),
        ("tax", Decimal),
        ("savings_total", Decimal),
        ("source_url", str),
    )
    for field, expected in optional_types:
        if result.get(field) is not None:
            assert isinstance(result[field], expected)

    # Reject anything outside the known key set.
    permitted = RECEIPT_REQUIRED_KEYS | RECEIPT_OPTIONAL_KEYS
    for present in result:
        assert present in permitted, f"Unexpected key in receipt: {present}"
|
||||
|
||||
|
||||
def _validate_item_schema(item: dict) -> None:
    """Check a parsed line-item dict against the item schema.

    Asserts that every key in ITEM_REQUIRED_KEYS is present, that fields have
    the expected types, that a non-None UPC does not start with "0", and that
    no key outside ITEM_REQUIRED_KEYS | ITEM_OPTIONAL_KEYS appears.

    Raises:
        AssertionError: on the first violation found.
    """
    for required in ITEM_REQUIRED_KEYS:
        assert required in item, f"Missing required item key: {required}"

    # The product name must be a non-empty string.
    name = item["product_name_raw"]
    assert isinstance(name, str)
    assert len(name) > 0

    for numeric_key in ("quantity", "unit_price", "extended_price"):
        assert isinstance(item[numeric_key], Decimal)

    # UPC is either None or a string without a leading zero
    # (leading zeros are expected to be stripped during parsing).
    upc = item["upc"]
    if upc is not None:
        assert isinstance(upc, str)
        assert not upc.startswith("0"), f"UPC has leading zeros: {upc}"

    # Optional price/discount fields must be Decimal when set.
    for opt_key in ("regular_price", "sale_price", "coupon_discount", "loyalty_discount"):
        value = item.get(opt_key)
        if value is not None:
            assert isinstance(value, Decimal), f"{opt_key} should be Decimal"

    category = item.get("category_raw")
    if category is not None:
        assert isinstance(category, str)

    # Reject anything outside the known key set.
    permitted = ITEM_REQUIRED_KEYS | ITEM_OPTIONAL_KEYS
    for present in item:
        assert present in permitted, f"Unexpected key in item: {present}"
|
||||
|
||||
|
||||
class TestKrogerSchemaValidation:
    """Schema and value checks for parse_kroger_receipt on the Kroger fixture."""

    @staticmethod
    def _parse(receipt_data, **extra):
        """Parse the fixture wrapped in a RawReceipt; ``extra`` adds RawReceipt fields."""
        fields = {
            "receipt_id": "KR-2026-0312-4471",
            "purchase_date": "2026-03-12T16:45:00Z",
            "raw_data": receipt_data,
        }
        fields.update(extra)
        return parse_kroger_receipt(RawReceipt(**fields))

    def test_full_receipt_schema(self, kroger_receipt_data):
        """The parsed receipt and every parsed item pass schema validation."""
        parsed = self._parse(
            kroger_receipt_data,
            store_number="00357",
            source_url="https://www.kroger.com/atlas/v1/receipt/api?orderId=KR-2026-0312-4471",
        )
        _validate_receipt_schema(parsed)
        for entry in parsed["items"]:
            _validate_item_schema(entry)

    def test_item_count_excludes_voided_and_returned(self, kroger_receipt_data):
        """Fixture has 10 items, 2 should be excluded (voided + returned)."""
        parsed = self._parse(kroger_receipt_data)
        assert len(parsed["items"]) == 8

    def test_totals_are_positive_decimals(self, kroger_receipt_data):
        """Total, subtotal, tax and savings_total are all greater than zero."""
        parsed = self._parse(kroger_receipt_data)
        zero = Decimal("0")
        assert parsed["total"] > zero
        assert parsed["subtotal"] > zero
        assert parsed["tax"] > zero
        assert parsed["savings_total"] > zero

    def test_receipt_id_preserved(self, kroger_receipt_data):
        """The receipt_id given to RawReceipt is carried into the parsed result."""
        parsed = self._parse(kroger_receipt_data)
        assert parsed["receipt_id"] == "KR-2026-0312-4471"

    def test_known_product_prices(self, kroger_receipt_data):
        """Verify specific products produce correct price extraction."""
        parsed = self._parse(kroger_receipt_data)
        by_name = {entry["product_name_raw"]: entry for entry in parsed["items"]}

        # Milk: $3.99, regular $4.29
        milk = by_name["KROGER WHOLE MILK GAL"]
        assert milk["unit_price"] == Decimal("3.99")
        assert milk["regular_price"] == Decimal("4.29")
        assert milk["sale_price"] == Decimal("3.99")

        # Eggs: qty 2, $5.49 each, total $10.98
        eggs = by_name["SIMPLE TRUTH ORG EGGS 12CT"]
        assert eggs["quantity"] == Decimal("2")
        assert eggs["unit_price"] == Decimal("5.49")
        assert eggs["extended_price"] == Decimal("10.98")

        # Deli turkey: weighted item, 0.68 lb
        turkey = by_name["KROGER DELI TURKEY BREAST"]
        assert turkey["quantity"] == Decimal("0.68")
        assert turkey["upc"] is None

    def test_multi_quantity_item_correct(self, kroger_receipt_data):
        """Pasta is qty=3, unit=$2.49, total=$7.47."""
        parsed = self._parse(kroger_receipt_data)
        pasta_items = [entry for entry in parsed["items"] if "PASTA" in entry["product_name_raw"]]
        pasta = pasta_items[0]
        assert pasta["quantity"] == Decimal("3")
        assert pasta["unit_price"] == Decimal("2.49")
        assert pasta["extended_price"] == Decimal("7.47")

    def test_coupon_discount_captured(self, kroger_receipt_data):
        """Tide Pods has $2.00 coupon."""
        parsed = self._parse(kroger_receipt_data)
        tide_items = [entry for entry in parsed["items"] if "TIDE" in entry["product_name_raw"]]
        tide = tide_items[0]
        assert tide["coupon_discount"] == Decimal("2.00")
|
||||
|
||||
|
||||
class TestMeijerSchemaValidation:
    """Schema and value checks for parse_meijer_receipt on the Meijer fixture."""

    @staticmethod
    def _parse(receipt_data, **extra):
        """Parse the fixture wrapped in a RawReceipt; ``extra`` adds RawReceipt fields."""
        fields = {
            "receipt_id": "TXN-2026-0310-001",
            "purchase_date": "2026-03-10T14:30:00Z",
            "raw_data": receipt_data,
        }
        fields.update(extra)
        return parse_meijer_receipt(RawReceipt(**fields))

    def test_full_receipt_schema(self, meijer_receipt_data):
        """The parsed receipt and every parsed item pass schema validation."""
        result = self._parse(
            meijer_receipt_data,
            store_number="42",
            source_url="https://www.meijer.com/bin/meijer/profile/receipt?receiptId=TXN-2026-0310-001",
        )
        _validate_receipt_schema(result)
        for item in result["items"]:
            _validate_item_schema(item)

    def test_item_count_excludes_voided(self, meijer_receipt_data):
        """Fixture has 6 items, 1 should be excluded (voided soda)."""
        result = self._parse(meijer_receipt_data)
        assert len(result["items"]) == 5

    def test_totals_are_positive_decimals(self, meijer_receipt_data):
        """Total, subtotal, tax and savings_total are all greater than zero."""
        result = self._parse(meijer_receipt_data)
        assert result["total"] > Decimal("0")
        assert result["subtotal"] > Decimal("0")
        assert result["tax"] > Decimal("0")
        assert result["savings_total"] > Decimal("0")

    def test_receipt_id_preserved(self, meijer_receipt_data):
        """The receipt_id given to RawReceipt is carried into the parsed result."""
        result = self._parse(meijer_receipt_data)
        assert result["receipt_id"] == "TXN-2026-0310-001"

    def test_known_product_prices(self, meijer_receipt_data):
        """Verify specific Meijer products produce correct price extraction."""
        result = self._parse(meijer_receipt_data)
        items_by_name = {i["product_name_raw"]: i for i in result["items"]}

        # Bananas: $0.69
        bananas = items_by_name["ORGANIC BANANAS"]
        assert bananas["unit_price"] == Decimal("0.69")
        # The previous check here passed whenever "mperks_discount" was absent,
        # so it verified nothing. The item schema does not allow that key;
        # the discount is asserted under "loyalty_discount" instead.
        assert "mperks_discount" not in bananas
        assert bananas["loyalty_discount"] == Decimal("0.10")

        # Milk: qty 2, $3.49 each, total $6.98
        milk = items_by_name["MEIJER 2% MILK GAL"]
        assert milk["quantity"] == Decimal("2")
        assert milk["unit_price"] == Decimal("3.49")
        assert milk["extended_price"] == Decimal("6.98")

        # Weighted deli turkey: 0.75 lb at $8.99/lb
        turkey = items_by_name["WEIGHTED DELI TURKEY"]
        assert turkey["quantity"] == Decimal("0.75")
        assert turkey["upc"] is None

    def test_mperks_discount_captured(self, meijer_receipt_data):
        """Paper towels has $1.00 mPerks discount."""
        result = self._parse(meijer_receipt_data)
        towels = [i for i in result["items"] if "PAPER TOWELS" in i["product_name_raw"]][0]
        assert towels["loyalty_discount"] == Decimal("1.00")
        assert towels["coupon_discount"] == Decimal("1.00")

    def test_cheerios_coupon_discount(self, meijer_receipt_data):
        """Cheerios has $0.50 coupon."""
        result = self._parse(meijer_receipt_data)
        cheerios = [i for i in result["items"] if "CHEERIOS" in i["product_name_raw"]][0]
        assert cheerios["coupon_discount"] == Decimal("0.50")
|
||||
|
||||
|
||||
class TestEmptyAndEdgeCaseSchemas:
    """Edge-case receipts must parse without crashing and still match the schema."""

    def test_kroger_empty_receipt(self):
        """Empty Kroger raw_data parses to no items and a zero total."""
        empty = RawReceipt(receipt_id="KR-EMPTY", purchase_date="2026-03-12", raw_data={})
        parsed = parse_kroger_receipt(empty)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("0")

    def test_meijer_empty_receipt(self):
        """Empty Meijer raw_data parses to no items and a zero total."""
        empty = RawReceipt(receipt_id="MJ-EMPTY", purchase_date="2026-03-10", raw_data={})
        parsed = parse_meijer_receipt(empty)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("0")

    def test_kroger_receipt_no_detail(self):
        """Kroger raw_data with only a total keeps the total and has no items."""
        summary_only = {"total": 50.00}
        parsed = parse_kroger_receipt(
            RawReceipt(receipt_id="KR-NODET", purchase_date="2026-03-12", raw_data=summary_only)
        )
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("50.00")

    def test_meijer_receipt_no_detail(self):
        """Meijer raw_data with only a total keeps the total and has no items."""
        summary_only = {"total": 30.00}
        parsed = parse_meijer_receipt(
            RawReceipt(receipt_id="MJ-NODET", purchase_date="2026-03-10", raw_data=summary_only)
        )
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("30.00")

    def test_kroger_receipt_all_voided(self):
        """A receipt where every item is voided should have 0 items."""
        # Each entry uses a different void/return marker.
        excluded_items = [
            {"description": "VOIDED A", "basePrice": 5.0, "voided": True},
            {"description": "VOIDED B", "basePrice": 3.0, "status": "VOIDED"},
            {"description": "RETURNED C", "basePrice": 7.0, "status": "RETURNED"},
            {"description": "RETURNED D", "basePrice": 2.0, "returnFlag": True},
        ]
        payload = {"detail": {"items": excluded_items, "total": 0}}
        parsed = parse_kroger_receipt(
            RawReceipt(receipt_id="KR-ALLVOID", purchase_date="2026-03-12", raw_data=payload)
        )
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []

    def test_meijer_receipt_all_voided(self):
        """A Meijer receipt whose items are all voided should have 0 items."""
        excluded_items = [
            {"description": "VOIDED A", "price": 5.0, "voided": True},
            {"description": "VOIDED B", "price": 3.0, "status": "VOIDED"},
        ]
        payload = {"detail": {"items": excluded_items, "total": 0}}
        parsed = parse_meijer_receipt(
            RawReceipt(receipt_id="MJ-ALLVOID", purchase_date="2026-03-10", raw_data=payload)
        )
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
|
||||
@@ -0,0 +1,58 @@
|
||||
"""Tests for the base scraper class."""
|
||||
|
||||
from datetime import datetime
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from receiptwitness.scrapers.base import BaseScraper, RawReceipt, SessionData
|
||||
|
||||
|
||||
class ConcreteScraper(BaseScraper):
    """Minimal BaseScraper subclass so the base class can be instantiated in tests.

    Every method returns a fixed stub value and performs no I/O.
    """

    async def login(self, username, password):
        """Return a stub session with no cookies; the credentials are ignored."""
        # Local import: this module only imports `datetime` at the top.
        from datetime import UTC

        return SessionData(
            cookies=[],
            user_agent="test",
            # Timezone-aware, matching how the other test modules build SessionData.
            created_at=datetime.now(UTC),
        )

    async def check_session(self, session):
        """Always report the session as valid."""
        return True

    async def scrape_receipts(self, session, since=None):
        """Return no receipts."""
        return []

    def parse_receipt(self, raw):
        """Return an empty parsed receipt."""
        return {}
|
||||
|
||||
|
||||
class TestBaseScraper:
    """Tests for BaseScraper.human_delay and the RawReceipt / SessionData containers."""

    @pytest.mark.asyncio
    async def test_human_delay_respects_bounds(self):
        """human_delay(100, 200) sleeps for between 0.1 and 0.2 seconds."""
        scraper = ConcreteScraper()
        with patch("receiptwitness.scrapers.base.asyncio.sleep") as mock_sleep:
            mock_sleep.return_value = None
            await scraper.human_delay(min_ms=100, max_ms=200)
            # First positional argument of the recorded sleep() call, in seconds.
            call_args = mock_sleep.call_args[0][0]
            assert 0.1 <= call_args <= 0.2

    def test_raw_receipt_dataclass(self):
        """RawReceipt exposes the values it was constructed with."""
        receipt = RawReceipt(
            receipt_id="test-123",
            purchase_date="2026-03-10",
            store_number="42",
            raw_data={"key": "value"},
        )
        assert receipt.receipt_id == "test-123"
        assert receipt.raw_data == {"key": "value"}

    def test_session_data_defaults(self):
        """SessionData defaults expires_at to None and extra to an empty dict."""
        # Local import: this module only imports `datetime` at the top.
        from datetime import UTC

        session = SessionData(
            cookies=[],
            user_agent="test",
            # Timezone-aware, matching how the other test modules build SessionData.
            created_at=datetime.now(UTC),
        )
        assert session.expires_at is None
        assert session.extra == {}
|
||||
@@ -0,0 +1,574 @@
|
||||
"""Tests for the Kroger scraper.
|
||||
|
||||
These tests mock Playwright to avoid requiring real Kroger credentials
|
||||
or network access. They verify the scraper's control flow, session handling,
|
||||
date filtering, and error resilience.
|
||||
"""
|
||||
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from receiptwitness.scrapers.base import RawReceipt, SessionData
|
||||
from receiptwitness.scrapers.kroger import (
|
||||
DEFAULT_TIMEZONE,
|
||||
DEFAULT_USER_AGENT,
|
||||
DEFAULT_VIEWPORT,
|
||||
KROGER_BASE,
|
||||
KROGER_LOGIN_PAGE,
|
||||
KROGER_PURCHASE_HISTORY,
|
||||
KrogerScraper,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def scraper():
    """Provide a fresh KrogerScraper for each test."""
    instance = KrogerScraper()
    return instance
|
||||
|
||||
|
||||
@pytest.fixture
def valid_session():
    """Provide a Kroger session with one cookie that expires two hours from now."""
    cookie = {"name": "session", "value": "abc123", "domain": ".kroger.com", "path": "/"}
    created = datetime.now(UTC)
    expiry = datetime.now(UTC) + timedelta(hours=2)
    return SessionData(
        cookies=[cookie],
        user_agent=DEFAULT_USER_AGENT,
        created_at=created,
        expires_at=expiry,
        extra={"retailer": "kroger"},
    )
|
||||
|
||||
|
||||
@pytest.fixture
def expired_session():
    """Provide a Kroger session created four hours ago that expired two hours ago."""
    cookie = {"name": "session", "value": "expired", "domain": ".kroger.com", "path": "/"}
    created = datetime.now(UTC) - timedelta(hours=4)
    expiry = datetime.now(UTC) - timedelta(hours=2)
    return SessionData(
        cookies=[cookie],
        user_agent=DEFAULT_USER_AGENT,
        created_at=created,
        expires_at=expiry,
    )
|
||||
|
||||
|
||||
class TestKrogerScraperConstants:
    """Pin the module-level constants exported by the Kroger scraper."""

    def test_base_url(self):
        """KROGER_BASE is the site root."""
        expected = "https://www.kroger.com"
        assert KROGER_BASE == expected

    def test_login_page(self):
        """KROGER_LOGIN_PAGE is the sign-in URL."""
        expected = "https://www.kroger.com/signin"
        assert KROGER_LOGIN_PAGE == expected

    def test_purchase_history_page(self):
        """KROGER_PURCHASE_HISTORY is the purchases URL."""
        expected = "https://www.kroger.com/mypurchases"
        assert KROGER_PURCHASE_HISTORY == expected

    def test_default_user_agent_is_chrome(self):
        """The default user agent mentions both Chrome and Windows."""
        for token in ("Chrome", "Windows"):
            assert token in DEFAULT_USER_AGENT

    def test_default_viewport_hd(self):
        """The default viewport is 1920x1080."""
        expected = {"width": 1920, "height": 1080}
        assert DEFAULT_VIEWPORT == expected

    def test_default_timezone(self):
        """The default timezone is America/New_York."""
        expected = "America/New_York"
        assert DEFAULT_TIMEZONE == expected
|
||||
|
||||
|
||||
class TestCheckSession:
    """Tests for KrogerScraper.check_session with Playwright mocked out."""

    @staticmethod
    def _page_at(url):
        """Build a page mock whose ``url`` is ``url`` and whose goto() returns an OK response."""
        response = MagicMock()
        response.ok = True
        page = AsyncMock()
        page.url = url
        page.goto = AsyncMock(return_value=response)
        return page

    @staticmethod
    def _playwright_manager(page):
        """Build the async-context-manager mock that ultimately hands out ``page``."""
        context = AsyncMock(
            new_page=AsyncMock(return_value=page),
            add_cookies=AsyncMock(),
            add_init_script=AsyncMock(),
        )
        browser = AsyncMock(new_context=AsyncMock(return_value=context))
        context.browser = browser
        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        manager = AsyncMock()
        manager.__aenter__ = AsyncMock(return_value=playwright)
        manager.__aexit__ = AsyncMock(return_value=False)
        return manager

    @pytest.mark.asyncio
    async def test_expired_session_returns_false(self, scraper, expired_session):
        """A session whose expires_at is in the past is reported invalid."""
        outcome = await scraper.check_session(expired_session)
        assert outcome is False

    @pytest.mark.asyncio
    async def test_no_expiry_checks_via_browser(self, scraper):
        """With no expiry set, a page that stays on an account URL means valid."""
        session = SessionData(
            cookies=[],
            user_agent=DEFAULT_USER_AGENT,
            created_at=datetime.now(UTC),
            expires_at=None,
        )
        page = self._page_at("https://www.kroger.com/account/dashboard")

        with patch("receiptwitness.scrapers.kroger.async_playwright") as fake_apw:
            fake_apw.return_value = self._playwright_manager(page)
            outcome = await scraper.check_session(session)

        assert outcome is True

    @pytest.mark.asyncio
    async def test_session_redirected_to_signin_returns_false(self, scraper):
        """With no expiry set, a page that ends up on the sign-in URL means invalid."""
        session = SessionData(
            cookies=[],
            user_agent=DEFAULT_USER_AGENT,
            created_at=datetime.now(UTC),
            expires_at=None,
        )
        page = self._page_at("https://www.kroger.com/signin?redirectUrl=account")

        with patch("receiptwitness.scrapers.kroger.async_playwright") as fake_apw:
            fake_apw.return_value = self._playwright_manager(page)
            outcome = await scraper.check_session(session)

        assert outcome is False
|
||||
|
||||
|
||||
class TestLogin:
    """Tests for KrogerScraper.login with Playwright mocked out."""

    @pytest.mark.asyncio
    async def test_login_returns_session_data(self, scraper):
        """login() returns a SessionData built from the browser context's cookies."""
        # Successive page.locator() calls hand out these three mocks in order.
        email_field, password_field, submit_button = AsyncMock(), AsyncMock(), AsyncMock()
        page = AsyncMock()
        page.url = "https://www.kroger.com/"
        page.locator = MagicMock(side_effect=[email_field, password_field, submit_button])
        page.wait_for_url = AsyncMock()

        # The context reports a single cookie.
        stored_cookies = [
            {"name": "kroger_session", "value": "test123", "domain": ".kroger.com", "path": "/"}
        ]
        context = AsyncMock(
            new_page=AsyncMock(return_value=page),
            cookies=AsyncMock(return_value=stored_cookies),
            add_cookies=AsyncMock(),
            add_init_script=AsyncMock(),
        )
        browser = AsyncMock(new_context=AsyncMock(return_value=context))
        context.browser = browser
        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        manager = AsyncMock()
        manager.__aenter__ = AsyncMock(return_value=playwright)
        manager.__aexit__ = AsyncMock(return_value=False)

        with (
            patch("receiptwitness.scrapers.kroger.async_playwright") as fake_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            fake_apw.return_value = manager
            session = await scraper.login("user@test.com", "password123")

        assert isinstance(session, SessionData)
        assert len(session.cookies) == 1
        assert session.cookies[0]["name"] == "kroger_session"
        assert session.user_agent == DEFAULT_USER_AGENT
        assert session.expires_at is not None
        assert session.extra == {"retailer": "kroger"}
|
||||
|
||||
|
||||
class TestScrapeReceipts:
|
||||
@pytest.mark.asyncio
async def test_scrape_returns_receipts(self, scraper, valid_session):
    """Two listed orders come back as two RawReceipt objects, in listing order."""
    orders = [
        {
            "orderId": "KR-001",
            "purchaseDate": "2026-03-10T14:00:00Z",
            "storeNumber": "357",
        },
        {
            "orderId": "KR-002",
            "purchaseDate": "2026-03-11T10:00:00Z",
            "storeNumber": "357",
        },
    ]
    listing = AsyncMock(ok=True, status=200, json=AsyncMock(return_value={"orders": orders}))
    detail = AsyncMock(ok=True, json=AsyncMock(return_value={"items": []}))

    # First request gets the listing; the next two each get the detail response.
    api = AsyncMock(get=AsyncMock(side_effect=[listing, detail, detail]))

    # Mocked Playwright object graph: playwright -> browser -> context -> page.
    page = AsyncMock(goto=AsyncMock(), request=api)
    context = AsyncMock(
        new_page=AsyncMock(return_value=page),
        add_cookies=AsyncMock(),
        add_init_script=AsyncMock(),
    )
    browser = AsyncMock(new_context=AsyncMock(return_value=context))
    context.browser = browser
    playwright = AsyncMock()
    playwright.chromium.launch = AsyncMock(return_value=browser)

    manager = AsyncMock()
    manager.__aenter__ = AsyncMock(return_value=playwright)
    manager.__aexit__ = AsyncMock(return_value=False)

    with (
        patch("receiptwitness.scrapers.kroger.async_playwright") as fake_apw,
        patch.object(scraper, "human_delay", new_callable=AsyncMock),
    ):
        fake_apw.return_value = manager
        receipts = await scraper.scrape_receipts(valid_session)

    assert len(receipts) == 2
    assert receipts[0].receipt_id == "KR-001"
    assert receipts[1].receipt_id == "KR-002"
    assert isinstance(receipts[0], RawReceipt)
|
||||
|
||||
@pytest.mark.asyncio
async def test_scrape_filters_by_date(self, scraper, valid_session):
    """Only orders purchased on or after ``since`` are returned."""
    orders = [
        {
            "orderId": "KR-OLD",
            "purchaseDate": "2026-01-01T10:00:00Z",
            "storeNumber": "357",
        },
        {
            "orderId": "KR-NEW",
            "purchaseDate": "2026-03-15T10:00:00Z",
            "storeNumber": "357",
        },
    ]
    listing = AsyncMock(ok=True, json=AsyncMock(return_value={"orders": orders}))
    detail = AsyncMock(ok=True, json=AsyncMock(return_value={}))

    # Only one detail response is queued after the listing.
    api = AsyncMock(get=AsyncMock(side_effect=[listing, detail]))

    # Mocked Playwright object graph: playwright -> browser -> context -> page.
    page = AsyncMock(goto=AsyncMock(), request=api)
    context = AsyncMock(
        new_page=AsyncMock(return_value=page),
        add_cookies=AsyncMock(),
        add_init_script=AsyncMock(),
    )
    browser = AsyncMock(new_context=AsyncMock(return_value=context))
    context.browser = browser
    playwright = AsyncMock()
    playwright.chromium.launch = AsyncMock(return_value=browser)

    manager = AsyncMock()
    manager.__aenter__ = AsyncMock(return_value=playwright)
    manager.__aexit__ = AsyncMock(return_value=False)

    cutoff = datetime(2026, 3, 1, tzinfo=UTC)

    with (
        patch("receiptwitness.scrapers.kroger.async_playwright") as fake_apw,
        patch.object(scraper, "human_delay", new_callable=AsyncMock),
    ):
        fake_apw.return_value = manager
        receipts = await scraper.scrape_receipts(valid_session, since=cutoff)

    assert len(receipts) == 1
    assert receipts[0].receipt_id == "KR-NEW"
|
||||
|
||||
@pytest.mark.asyncio
async def test_scrape_handles_api_failure(self, scraper, valid_session):
    """A non-OK (500) listing response produces an empty receipt list."""
    server_error = AsyncMock(ok=False, status=500, status_text="Internal Server Error")
    api = AsyncMock(get=AsyncMock(return_value=server_error))

    # Mocked Playwright object graph: playwright -> browser -> context -> page.
    page = AsyncMock(goto=AsyncMock(), request=api)
    context = AsyncMock(
        new_page=AsyncMock(return_value=page),
        add_cookies=AsyncMock(),
        add_init_script=AsyncMock(),
    )
    browser = AsyncMock(new_context=AsyncMock(return_value=context))
    context.browser = browser
    playwright = AsyncMock()
    playwright.chromium.launch = AsyncMock(return_value=browser)

    manager = AsyncMock()
    manager.__aenter__ = AsyncMock(return_value=playwright)
    manager.__aexit__ = AsyncMock(return_value=False)

    with (
        patch("receiptwitness.scrapers.kroger.async_playwright") as fake_apw,
        patch.object(scraper, "human_delay", new_callable=AsyncMock),
    ):
        fake_apw.return_value = manager
        receipts = await scraper.scrape_receipts(valid_session)

    assert receipts == []
|
||||
|
||||
@pytest.mark.asyncio
async def test_scrape_handles_unexpected_response(self, scraper, valid_session):
    """An OK response whose JSON body is not a dict produces an empty receipt list."""
    malformed = AsyncMock(ok=True, json=AsyncMock(return_value="not a dict"))
    api = AsyncMock(get=AsyncMock(return_value=malformed))

    # Mocked Playwright object graph: playwright -> browser -> context -> page.
    page = AsyncMock(goto=AsyncMock(), request=api)
    context = AsyncMock(
        new_page=AsyncMock(return_value=page),
        add_cookies=AsyncMock(),
        add_init_script=AsyncMock(),
    )
    browser = AsyncMock(new_context=AsyncMock(return_value=context))
    context.browser = browser
    playwright = AsyncMock()
    playwright.chromium.launch = AsyncMock(return_value=browser)

    manager = AsyncMock()
    manager.__aenter__ = AsyncMock(return_value=playwright)
    manager.__aexit__ = AsyncMock(return_value=False)

    with (
        patch("receiptwitness.scrapers.kroger.async_playwright") as fake_apw,
        patch.object(scraper, "human_delay", new_callable=AsyncMock),
    ):
        fake_apw.return_value = manager
        receipts = await scraper.scrape_receipts(valid_session)

    assert receipts == []
|
||||
|
||||
@pytest.mark.asyncio
async def test_scrape_alternative_field_names(self, scraper, valid_session):
    """A listing keyed by 'purchases' (instead of 'orders') still yields its receipt."""
    purchase = {
        "receiptId": "KR-ALT-001",
        "transactionDate": "2026-03-10T14:00:00Z",
        "divisionNumber": "014",
    }
    listing = AsyncMock()
    listing.ok = True
    listing.json = AsyncMock(return_value={"purchases": [purchase]})

    detail = AsyncMock()
    detail.ok = True
    detail.json = AsyncMock(return_value={})

    page = AsyncMock()
    page.goto = AsyncMock()
    page.request = AsyncMock()
    # First mocked GET returns the listing, the second the empty detail body.
    page.request.get = AsyncMock(side_effect=[listing, detail])

    context = AsyncMock()
    context.new_page = AsyncMock(return_value=page)
    context.add_cookies = AsyncMock()
    context.add_init_script = AsyncMock()

    browser = AsyncMock()
    browser.new_context = AsyncMock(return_value=context)
    context.browser = browser

    playwright = AsyncMock()
    playwright.chromium.launch = AsyncMock(return_value=browser)

    # Object returned by the patched async_playwright(); entering it with
    # `async with` yields the fake Playwright handle.
    playwright_cm = AsyncMock()
    playwright_cm.__aenter__ = AsyncMock(return_value=playwright)
    playwright_cm.__aexit__ = AsyncMock(return_value=False)

    with (
        patch("receiptwitness.scrapers.kroger.async_playwright") as apw,
        patch.object(scraper, "human_delay", new_callable=AsyncMock),
    ):
        apw.return_value = playwright_cm
        receipts = await scraper.scrape_receipts(valid_session)

    assert len(receipts) == 1
    assert receipts[0].receipt_id == "KR-ALT-001"
|
||||
|
||||
@pytest.mark.asyncio
async def test_scrape_skips_orders_without_id(self, scraper, valid_session):
    """An order entry lacking any id key is dropped; the identified one is kept."""
    anonymous_order = {"purchaseDate": "2026-03-10T14:00:00Z"}  # no id
    identified_order = {"orderId": "KR-VALID", "purchaseDate": "2026-03-10T14:00:00Z"}

    listing = AsyncMock()
    listing.ok = True
    listing.json = AsyncMock(return_value={"orders": [anonymous_order, identified_order]})

    detail = AsyncMock()
    detail.ok = True
    detail.json = AsyncMock(return_value={})

    page = AsyncMock()
    page.goto = AsyncMock()
    page.request = AsyncMock()
    # Only two GET results are queued: the listing and a single detail body.
    page.request.get = AsyncMock(side_effect=[listing, detail])

    context = AsyncMock()
    context.new_page = AsyncMock(return_value=page)
    context.add_cookies = AsyncMock()
    context.add_init_script = AsyncMock()

    browser = AsyncMock()
    browser.new_context = AsyncMock(return_value=context)
    context.browser = browser

    playwright = AsyncMock()
    playwright.chromium.launch = AsyncMock(return_value=browser)

    # Object returned by the patched async_playwright(); entering it with
    # `async with` yields the fake Playwright handle.
    playwright_cm = AsyncMock()
    playwright_cm.__aenter__ = AsyncMock(return_value=playwright)
    playwright_cm.__aexit__ = AsyncMock(return_value=False)

    with (
        patch("receiptwitness.scrapers.kroger.async_playwright") as apw,
        patch.object(scraper, "human_delay", new_callable=AsyncMock),
    ):
        apw.return_value = playwright_cm
        receipts = await scraper.scrape_receipts(valid_session)

    assert len(receipts) == 1
    assert receipts[0].receipt_id == "KR-VALID"
|
||||
|
||||
@pytest.mark.asyncio
async def test_scrape_skips_orders_with_null_id(self, scraper, valid_session):
    """Guard against str(None): explicit null ids must not become receipt_id 'None'."""
    null_id_order = {"orderId": None, "receiptId": None, "purchaseDate": "2026-03-10T14:00:00Z"}
    real_order = {"orderId": "KR-REAL", "purchaseDate": "2026-03-10T14:00:00Z"}

    listing = AsyncMock()
    listing.ok = True
    listing.json = AsyncMock(return_value={"orders": [null_id_order, real_order]})

    detail = AsyncMock()
    detail.ok = True
    detail.json = AsyncMock(return_value={})

    page = AsyncMock()
    page.goto = AsyncMock()
    page.request = AsyncMock()
    # Only two GET results are queued: the listing and a single detail body.
    page.request.get = AsyncMock(side_effect=[listing, detail])

    context = AsyncMock()
    context.new_page = AsyncMock(return_value=page)
    context.add_cookies = AsyncMock()
    context.add_init_script = AsyncMock()

    browser = AsyncMock()
    browser.new_context = AsyncMock(return_value=context)
    context.browser = browser

    playwright = AsyncMock()
    playwright.chromium.launch = AsyncMock(return_value=browser)

    # Object returned by the patched async_playwright(); entering it with
    # `async with` yields the fake Playwright handle.
    playwright_cm = AsyncMock()
    playwright_cm.__aenter__ = AsyncMock(return_value=playwright)
    playwright_cm.__aexit__ = AsyncMock(return_value=False)

    with (
        patch("receiptwitness.scrapers.kroger.async_playwright") as apw,
        patch.object(scraper, "human_delay", new_callable=AsyncMock),
    ):
        apw.return_value = playwright_cm
        receipts = await scraper.scrape_receipts(valid_session)

    assert len(receipts) == 1
    assert receipts[0].receipt_id == "KR-REAL"
    # Verify no receipt has the string "None" as its ID
    assert all(r.receipt_id != "None" for r in receipts)
|
||||
|
||||
|
||||
class TestParseReceipt:
    """Checks on scraper.parse_receipt() output for hand-built RawReceipt inputs."""

    def test_parse_receipt_delegates_to_parser(self, scraper):
        """A receipt with one detail item parses to the same id and one item."""
        line_item = {
            "description": "TEST ITEM",
            "basePrice": 5.00,
            "totalPrice": 5.00,
        }
        detail = {"items": [line_item], "total": 5.00}
        raw = RawReceipt(
            receipt_id="KR-001",
            purchase_date="2026-03-12",
            raw_data={"detail": detail},
        )

        parsed = scraper.parse_receipt(raw)

        assert parsed["receipt_id"] == "KR-001"
        assert len(parsed["items"]) == 1

    def test_receipt_detail_failure_returns_empty(self, scraper):
        """An empty 'detail' dict parses to an empty items list."""
        raw = RawReceipt(
            receipt_id="KR-FAIL",
            purchase_date="2026-03-12",
            raw_data={"total": 10.00, "detail": {}},
        )

        parsed = scraper.parse_receipt(raw)

        assert parsed["receipt_id"] == "KR-FAIL"
        assert parsed["items"] == []
|
||||
@@ -0,0 +1,585 @@
|
||||
"""Tests for the Meijer scraper.
|
||||
|
||||
These tests mock Playwright to avoid requiring real Meijer credentials
|
||||
or network access. They verify the scraper's control flow, session handling,
|
||||
date filtering, and error resilience.
|
||||
"""
|
||||
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from receiptwitness.scrapers.base import RawReceipt, SessionData
|
||||
from receiptwitness.scrapers.meijer import (
|
||||
DEFAULT_TIMEZONE,
|
||||
DEFAULT_USER_AGENT,
|
||||
DEFAULT_VIEWPORT,
|
||||
MEIJER_BASE,
|
||||
MEIJER_LOGIN_PAGE,
|
||||
MEIJER_MPERKS_HOME,
|
||||
MEIJER_PURCHASE_HISTORY,
|
||||
MeijerScraper,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def scraper():
    """Provide a fresh MeijerScraper built with no constructor arguments."""
    meijer_scraper = MeijerScraper()
    return meijer_scraper
|
||||
|
||||
|
||||
@pytest.fixture
def valid_session():
    """Provide a SessionData with one Meijer cookie, expiring four hours from now."""
    session_cookie = {
        "name": "meijer_session",
        "value": "abc123",
        "domain": ".meijer.com",
        "path": "/",
    }
    # The clock is read twice on purpose, once per timestamp.
    created = datetime.now(UTC)
    expires = datetime.now(UTC) + timedelta(hours=4)
    return SessionData(
        cookies=[session_cookie],
        user_agent=DEFAULT_USER_AGENT,
        created_at=created,
        expires_at=expires,
    )
|
||||
|
||||
|
||||
@pytest.fixture
def expired_session():
    """Provide a SessionData created eight hours ago that expired four hours ago."""
    stale_cookie = {
        "name": "meijer_session",
        "value": "expired",
        "domain": ".meijer.com",
        "path": "/",
    }
    # The clock is read twice on purpose, once per timestamp.
    created = datetime.now(UTC) - timedelta(hours=8)
    expires = datetime.now(UTC) - timedelta(hours=4)
    return SessionData(
        cookies=[stale_cookie],
        user_agent=DEFAULT_USER_AGENT,
        created_at=created,
        expires_at=expires,
    )
|
||||
|
||||
|
||||
class TestMeijerScraperConstants:
    """Pin the values of the constants imported from the Meijer scraper module."""

    def test_base_url(self):
        """MEIJER_BASE is the site root."""
        expected = "https://www.meijer.com"
        assert MEIJER_BASE == expected

    def test_login_page(self):
        """MEIJER_LOGIN_PAGE is the shopping login URL."""
        expected = "https://www.meijer.com/shopping/login.html"
        assert MEIJER_LOGIN_PAGE == expected

    def test_mperks_home(self):
        """MEIJER_MPERKS_HOME is the mPerks landing URL."""
        expected = "https://www.meijer.com/mperks.html"
        assert MEIJER_MPERKS_HOME == expected

    def test_purchase_history_url(self):
        """MEIJER_PURCHASE_HISTORY is the purchase-history URL."""
        expected = "https://www.meijer.com/bin/meijer/profile/purchasehistory"
        assert MEIJER_PURCHASE_HISTORY == expected

    def test_default_user_agent_is_chrome(self):
        """The default user agent mentions both Chrome and Windows."""
        for marker in ("Chrome", "Windows"):
            assert marker in DEFAULT_USER_AGENT

    def test_default_viewport_hd(self):
        """The default viewport is 1920x1080."""
        expected = {"width": 1920, "height": 1080}
        assert DEFAULT_VIEWPORT == expected

    def test_default_timezone(self):
        """The default timezone is America/Detroit."""
        expected = "America/Detroit"
        assert DEFAULT_TIMEZONE == expected
|
||||
|
||||
|
||||
class TestCheckSession:
    """check_session() results for expired sessions and sessions with no expiry."""

    @staticmethod
    def _session_without_expiry():
        """Build a cookie-less SessionData whose expires_at is None."""
        return SessionData(
            cookies=[],
            user_agent=DEFAULT_USER_AGENT,
            created_at=datetime.now(UTC),
            expires_at=None,
        )

    @staticmethod
    def _playwright_cm_with_page_url(url):
        """Build the object the patched async_playwright() should return.

        Entering it with `async with` yields a fake Playwright handle whose
        launched browser -> context -> page chain ends in a page that reports
        ``url`` and whose goto() resolves to an OK response.
        """
        nav_response = MagicMock()
        nav_response.ok = True

        page = AsyncMock()
        page.url = url
        page.goto = AsyncMock(return_value=nav_response)

        context = AsyncMock()
        context.new_page = AsyncMock(return_value=page)
        context.add_cookies = AsyncMock()
        context.add_init_script = AsyncMock()

        browser = AsyncMock()
        browser.new_context = AsyncMock(return_value=context)
        context.browser = browser

        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        playwright_cm = AsyncMock()
        playwright_cm.__aenter__ = AsyncMock(return_value=playwright)
        playwright_cm.__aexit__ = AsyncMock(return_value=False)
        return playwright_cm

    @pytest.mark.asyncio
    async def test_expired_session_returns_false(self, scraper, expired_session):
        """A session whose expires_at is in the past is reported invalid."""
        outcome = await scraper.check_session(expired_session)
        assert outcome is False

    @pytest.mark.asyncio
    async def test_no_expiry_checks_via_browser(self, scraper):
        """With no expiry and the page reporting the mPerks URL, the session is valid."""
        session = self._session_without_expiry()
        playwright_cm = self._playwright_cm_with_page_url("https://www.meijer.com/mperks.html")

        with patch("receiptwitness.scrapers.meijer.async_playwright") as apw:
            apw.return_value = playwright_cm
            outcome = await scraper.check_session(session)

        assert outcome is True

    @pytest.mark.asyncio
    async def test_session_redirected_to_login_returns_false(self, scraper):
        """With the page reporting the login URL, the session is invalid."""
        session = self._session_without_expiry()
        playwright_cm = self._playwright_cm_with_page_url(
            "https://www.meijer.com/shopping/login.html?redirect=mperks"
        )

        with patch("receiptwitness.scrapers.meijer.async_playwright") as apw:
            apw.return_value = playwright_cm
            outcome = await scraper.check_session(session)

        assert outcome is False
|
||||
|
||||
|
||||
class TestLogin:
    """login() against a fully mocked Playwright browser."""

    @pytest.mark.asyncio
    async def test_login_returns_session_data(self, scraper):
        """login() returns a SessionData carrying the mocked context's cookie."""
        # page.locator() hands out these three mocks in call order
        # (named email, password, button in that order).
        email_field, password_field, submit_button = AsyncMock(), AsyncMock(), AsyncMock()

        page = AsyncMock()
        page.url = "https://www.meijer.com/mperks.html"
        page.locator = MagicMock(side_effect=[email_field, password_field, submit_button])
        page.wait_for_url = AsyncMock()

        browser_cookie = {
            "name": "meijer_session",
            "value": "test456",
            "domain": ".meijer.com",
            "path": "/",
        }
        context = AsyncMock()
        context.new_page = AsyncMock(return_value=page)
        context.cookies = AsyncMock(return_value=[browser_cookie])
        context.add_cookies = AsyncMock()
        context.add_init_script = AsyncMock()

        browser = AsyncMock()
        browser.new_context = AsyncMock(return_value=context)
        context.browser = browser

        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        # Object returned by the patched async_playwright(); entering it with
        # `async with` yields the fake Playwright handle.
        playwright_cm = AsyncMock()
        playwright_cm.__aenter__ = AsyncMock(return_value=playwright)
        playwright_cm.__aexit__ = AsyncMock(return_value=False)

        with (
            patch("receiptwitness.scrapers.meijer.async_playwright") as apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            apw.return_value = playwright_cm
            session = await scraper.login("user@test.com", "password123")

        assert isinstance(session, SessionData)
        assert len(session.cookies) == 1
        assert session.cookies[0]["name"] == "meijer_session"
        assert session.user_agent == DEFAULT_USER_AGENT
        assert session.expires_at is not None
        # Meijer sessions last 4 hours; only "more than 3 hours" is asserted here.
        earliest_allowed_expiry = session.created_at + timedelta(hours=3)
        assert session.expires_at > earliest_allowed_expiry
|
||||
|
||||
|
||||
class TestScrapeReceipts:
    """scrape_receipts() against a fully mocked Playwright stack.

    Each test only describes the responses the mocked ``page.request.get``
    hands back; the shared browser/context/page scaffolding lives in the
    private helpers below.
    """

    @staticmethod
    def _ok_json_response(payload):
        """Return a mock response with ok=True whose awaited json() gives ``payload``."""
        response = AsyncMock()
        response.ok = True
        response.json = AsyncMock(return_value=payload)
        return response

    @staticmethod
    async def _scrape(scraper, session, request_get, **scrape_kwargs):
        """Run scraper.scrape_receipts() with Playwright and human_delay mocked.

        ``request_get`` is installed as ``page.request.get``; any extra keyword
        arguments are forwarded to scrape_receipts() unchanged.
        """
        page = AsyncMock()
        page.goto = AsyncMock()
        page.request = AsyncMock()
        page.request.get = request_get

        context = AsyncMock()
        context.new_page = AsyncMock(return_value=page)
        context.add_cookies = AsyncMock()
        context.add_init_script = AsyncMock()

        browser = AsyncMock()
        browser.new_context = AsyncMock(return_value=context)
        context.browser = browser

        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        # Object returned by the patched async_playwright(); entering it with
        # `async with` yields the fake Playwright handle.
        playwright_cm = AsyncMock()
        playwright_cm.__aenter__ = AsyncMock(return_value=playwright)
        playwright_cm.__aexit__ = AsyncMock(return_value=False)

        with (
            patch("receiptwitness.scrapers.meijer.async_playwright") as apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            apw.return_value = playwright_cm
            return await scraper.scrape_receipts(session, **scrape_kwargs)

    @pytest.mark.asyncio
    async def test_scrape_returns_receipts(self, scraper, valid_session):
        """Two listed transactions come back as two RawReceipt objects, in order."""
        listing = self._ok_json_response(
            {
                "transactions": [
                    {
                        "transactionId": "TXN-001",
                        "transactionDate": "2026-03-10T14:00:00Z",
                        "storeNumber": "42",
                    },
                    {
                        "transactionId": "TXN-002",
                        "transactionDate": "2026-03-11T10:00:00Z",
                        "storeNumber": "42",
                    },
                ]
            }
        )
        listing.status = 200
        detail = self._ok_json_response({"items": []})

        # One listing response followed by the same detail response twice.
        request_get = AsyncMock(side_effect=[listing, detail, detail])
        receipts = await self._scrape(scraper, valid_session, request_get)

        assert len(receipts) == 2
        assert receipts[0].receipt_id == "TXN-001"
        assert receipts[1].receipt_id == "TXN-002"
        assert isinstance(receipts[0], RawReceipt)

    @pytest.mark.asyncio
    async def test_scrape_filters_by_date(self, scraper, valid_session):
        """With since=2026-03-01, the January transaction is dropped."""
        listing = self._ok_json_response(
            {
                "transactions": [
                    {
                        "transactionId": "TXN-OLD",
                        "transactionDate": "2026-01-01T10:00:00Z",
                        "storeNumber": "42",
                    },
                    {
                        "transactionId": "TXN-NEW",
                        "transactionDate": "2026-03-15T10:00:00Z",
                        "storeNumber": "42",
                    },
                ]
            }
        )
        detail = self._ok_json_response({})
        cutoff = datetime(2026, 3, 1, tzinfo=UTC)

        # Only two GET results are queued: the listing and a single detail body.
        request_get = AsyncMock(side_effect=[listing, detail])
        receipts = await self._scrape(scraper, valid_session, request_get, since=cutoff)

        assert len(receipts) == 1
        assert receipts[0].receipt_id == "TXN-NEW"

    @pytest.mark.asyncio
    async def test_scrape_handles_api_failure(self, scraper, valid_session):
        """A non-OK (500) response yields an empty receipt list."""
        failure = AsyncMock()
        failure.ok = False
        failure.status = 500
        failure.status_text = "Internal Server Error"

        request_get = AsyncMock(return_value=failure)
        receipts = await self._scrape(scraper, valid_session, request_get)

        assert receipts == []

    @pytest.mark.asyncio
    async def test_scrape_handles_unexpected_response(self, scraper, valid_session):
        """An OK response whose JSON body is not a dict yields no receipts."""
        listing = self._ok_json_response("not a dict")

        request_get = AsyncMock(return_value=listing)
        receipts = await self._scrape(scraper, valid_session, request_get)

        assert receipts == []

    @pytest.mark.asyncio
    async def test_scrape_alternative_field_names(self, scraper, valid_session):
        """A listing keyed by 'purchaseHistory' (instead of 'transactions') still works."""
        listing = self._ok_json_response(
            {
                "purchaseHistory": [
                    {
                        "receiptId": "MJ-ALT-001",
                        "purchaseDate": "2026-03-10T14:00:00Z",
                        "storeId": "99",
                    }
                ]
            }
        )
        detail = self._ok_json_response({})

        request_get = AsyncMock(side_effect=[listing, detail])
        receipts = await self._scrape(scraper, valid_session, request_get)

        assert len(receipts) == 1
        assert receipts[0].receipt_id == "MJ-ALT-001"

    @pytest.mark.asyncio
    async def test_scrape_skips_transactions_without_id(self, scraper, valid_session):
        """A transaction entry lacking any id key is dropped; the identified one is kept."""
        listing = self._ok_json_response(
            {
                "transactions": [
                    {"transactionDate": "2026-03-10T14:00:00Z"},  # no id
                    {"transactionId": "TXN-VALID", "transactionDate": "2026-03-10T14:00:00Z"},
                ]
            }
        )
        detail = self._ok_json_response({})

        # Only two GET results are queued: the listing and a single detail body.
        request_get = AsyncMock(side_effect=[listing, detail])
        receipts = await self._scrape(scraper, valid_session, request_get)

        assert len(receipts) == 1
        assert receipts[0].receipt_id == "TXN-VALID"

    @pytest.mark.asyncio
    async def test_scrape_receipt_detail_failure_returns_empty_detail(self, scraper, valid_session):
        """Receipt detail API failure should not crash the scraper."""
        listing = self._ok_json_response(
            {
                "transactions": [
                    {
                        "transactionId": "TXN-DETAIL-FAIL",
                        "transactionDate": "2026-03-10T14:00:00Z",
                        "storeNumber": "42",
                    }
                ]
            }
        )
        # The second GET answers with a non-OK 404.
        detail_failure = AsyncMock()
        detail_failure.ok = False
        detail_failure.status = 404

        request_get = AsyncMock(side_effect=[listing, detail_failure])
        receipts = await self._scrape(scraper, valid_session, request_get)

        assert len(receipts) == 1
        assert receipts[0].receipt_id == "TXN-DETAIL-FAIL"
        assert receipts[0].raw_data.get("detail") == {}
|
||||
|
||||
|
||||
class TestParseReceipt:
    """Checks on scraper.parse_receipt() output for hand-built RawReceipt inputs."""

    def test_parse_receipt_delegates_to_parser(self, scraper):
        """A receipt with one detail item parses to the same id and one item."""
        line_item = {
            "description": "TEST ITEM",
            "price": 5.00,
            "extendedPrice": 5.00,
        }
        detail = {"items": [line_item], "total": 5.00}
        raw = RawReceipt(
            receipt_id="TXN-001",
            purchase_date="2026-03-10",
            raw_data={"detail": detail},
        )

        parsed = scraper.parse_receipt(raw)

        assert parsed["receipt_id"] == "TXN-001"
        assert len(parsed["items"]) == 1

    def test_receipt_detail_failure_returns_empty(self, scraper):
        """An empty 'detail' dict parses to an empty items list."""
        raw = RawReceipt(
            receipt_id="TXN-FAIL",
            purchase_date="2026-03-10",
            raw_data={"total": 10.00, "detail": {}},
        )

        parsed = scraper.parse_receipt(raw)

        assert parsed["receipt_id"] == "TXN-FAIL"
        assert parsed["items"] == []
|
||||
@@ -0,0 +1,61 @@
|
||||
"""Tests for session encryption/decryption."""
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from cryptography.fernet import Fernet, InvalidToken
|
||||
|
||||
from receiptwitness.session.encryption import decrypt_session_data, encrypt_session_data
|
||||
|
||||
# Random Fernet key (as str), generated once at import time; the autouse
# fixture below installs it as settings.session_encryption_key for every test.
TEST_KEY = Fernet.generate_key().decode()
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _mock_encryption_key():
    """Swap the encryption module's settings for a mock exposing TEST_KEY.

    Runs automatically around every test in this module; the patch is undone
    when the test finishes.
    """
    # Keyword arguments to patch() become attributes of the replacement mock.
    settings_patch = patch(
        "receiptwitness.session.encryption.settings",
        session_encryption_key=TEST_KEY,
    )
    with settings_patch:
        yield
|
||||
|
||||
|
||||
class TestEncryptDecrypt:
    """Round-trip and failure-mode tests for encrypt/decrypt_session_data."""

    def test_roundtrip(self):
        """Encrypting then decrypting returns the original dict."""
        data = {
            "cookies": [{"name": "session", "value": "abc123", "domain": ".meijer.com"}],
            "user_agent": "Mozilla/5.0",
        }
        encrypted = encrypt_session_data(data)
        assert isinstance(encrypted, str)
        # The ciphertext must not simply be the dict's string form.
        assert encrypted != str(data)

        decrypted = decrypt_session_data(encrypted)
        assert decrypted == data

    def test_different_data_different_ciphertext(self):
        """Different payloads encrypt to different tokens."""
        data1 = {"key": "value1"}
        data2 = {"key": "value2"}
        enc1 = encrypt_session_data(data1)
        enc2 = encrypt_session_data(data2)
        assert enc1 != enc2

    def test_decrypt_with_wrong_key_fails(self):
        """A token made with TEST_KEY cannot be decrypted under a different key."""
        data = {"cookies": []}
        encrypted = encrypt_session_data(data)

        wrong_key = Fernet.generate_key().decode()
        # This inner patch overrides the autouse fixture's settings for this block.
        with patch("receiptwitness.session.encryption.settings") as mock_settings:
            mock_settings.session_encryption_key = wrong_key
            with pytest.raises(InvalidToken):
                decrypt_session_data(encrypted)

    def test_decrypt_tampered_data_fails(self):
        """A token whose trailing characters were overwritten is rejected."""
        data = {"cookies": []}
        encrypted = encrypt_session_data(data)
        tampered = encrypted[:-5] + "XXXXX"
        # Expect the specific Fernet error, as in the wrong-key test above;
        # a bare Exception would also pass on unrelated failures.
        with pytest.raises(InvalidToken):
            decrypt_session_data(tampered)

    def test_no_key_raises_error(self):
        """An empty key makes encryption raise ValueError naming the env variable."""
        with patch("receiptwitness.session.encryption.settings") as mock_settings:
            mock_settings.session_encryption_key = ""
            with pytest.raises(ValueError, match="RW_SESSION_ENCRYPTION_KEY"):
                encrypt_session_data({"test": True})
|
||||
@@ -0,0 +1,102 @@
|
||||
"""Tests for session manager logic."""
|
||||
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
import pytest
|
||||
from cryptography.fernet import Fernet
|
||||
|
||||
from receiptwitness.scrapers.base import SessionData
|
||||
from receiptwitness.session.manager import (
|
||||
get_valid_session,
|
||||
session_from_db_record,
|
||||
session_to_db_value,
|
||||
)
|
||||
|
||||
# Random Fernet key (as str), generated once at import time; the autouse
# fixture below installs it as settings.session_encryption_key for every test.
TEST_KEY = Fernet.generate_key().decode()
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _mock_encryption_key():
    """Swap the encryption module's settings for a mock exposing TEST_KEY.

    Runs automatically around every test in this module; the patch is undone
    when the test finishes.
    """
    # Keyword arguments to patch() become attributes of the replacement mock.
    settings_patch = patch(
        "receiptwitness.session.encryption.settings",
        session_encryption_key=TEST_KEY,
    )
    with settings_patch:
        yield
|
||||
|
||||
|
||||
def _make_session(hours_until_expire: int = 4) -> SessionData:
    """Build a SessionData created now that expires ``hours_until_expire`` hours later.

    A negative value produces a session that is already expired.
    """
    created = datetime.now(UTC)
    lifetime = timedelta(hours=hours_until_expire)
    cookie = {"name": "sid", "value": "test", "domain": ".meijer.com"}
    return SessionData(
        cookies=[cookie],
        user_agent="Mozilla/5.0",
        created_at=created,
        expires_at=created + lifetime,
    )
|
||||
|
||||
|
||||
class TestSessionSerialization:
    """session_to_db_value() / session_from_db_record() conversions."""

    def test_roundtrip(self):
        """Serialising then restoring keeps cookies and user agent."""
        original = _make_session()

        restored = session_from_db_record(session_to_db_value(original))

        assert restored is not None
        assert restored.cookies == original.cookies
        assert restored.user_agent == original.user_agent

    def test_none_returns_none(self):
        """A missing DB value restores to None."""
        restored = session_from_db_record(None)
        assert restored is None

    def test_invalid_encrypted_returns_none(self):
        """An undecryptable DB value restores to None."""
        restored = session_from_db_record("garbage-data")
        assert restored is None
|
||||
|
||||
|
||||
class TestGetValidSession:
    """get_valid_session(): when a stored session is reused and when login() is called."""

    @pytest.mark.asyncio
    async def test_valid_existing_session(self):
        """A stored session that passes check_session is reused without logging in."""
        stored = _make_session()
        stored_value = session_to_db_value(stored)

        scraper = AsyncMock()
        scraper.check_session.return_value = True

        session, refreshed = await get_valid_session(scraper, stored_value, "user", "pass")

        assert not refreshed
        assert session.cookies == stored.cookies
        scraper.login.assert_not_called()

    @pytest.mark.asyncio
    async def test_expired_session_triggers_login(self):
        """A stored session that is already expired leads to login(username, password)."""
        expired = _make_session(hours_until_expire=-1)  # already expired
        stored_value = session_to_db_value(expired)

        scraper = AsyncMock()
        scraper.login.return_value = _make_session()

        _, refreshed = await get_valid_session(scraper, stored_value, "user", "pass")

        assert refreshed
        scraper.login.assert_called_once_with("user", "pass")

    @pytest.mark.asyncio
    async def test_no_existing_session_triggers_login(self):
        """With no stored session at all, login() is called exactly once."""
        scraper = AsyncMock()
        scraper.login.return_value = _make_session()

        _, refreshed = await get_valid_session(scraper, None, "user", "pass")

        assert refreshed
        scraper.login.assert_called_once()

    @pytest.mark.asyncio
    async def test_failed_session_check_triggers_login(self):
        """An unexpired stored session that fails check_session leads to login()."""
        stored_value = session_to_db_value(_make_session())

        scraper = AsyncMock()
        scraper.check_session.return_value = False
        scraper.login.return_value = _make_session()

        _, refreshed = await get_valid_session(scraper, stored_value, "user", "pass")

        assert refreshed
        scraper.login.assert_called_once()
|
||||
Reference in New Issue
Block a user