AkahuSync/EmoneyScraper/scraper.py

104 lines
4.1 KiB
Python
Executable file

import os
import re
from datetime import date, datetime
from typing import Any, Optional

from playwright.sync_api import sync_playwright, Playwright

from config import load_env


class Scraper:
    """Log in to the site at ``SCRAPER_URL`` with Playwright and read account data.

    The constructor loads the environment via ``load_env("EmoneyScraper")``,
    launches Firefox, opens ``SCRAPER_URL`` and submits the login form with
    ``SCRAPER_USERNAME`` / ``SCRAPER_PASSWORD``.  Call :meth:`close` when
    finished, or use the instance as a context manager::

        with Scraper(playwright) as scraper:
            rows = scraper.parse_transactions()
    """

    # CSS selectors for the login form.
    _LOGIN_ID_INPUT = "input#ctl00_ContentPlaceHolder1_txtLoginID"
    _PASSWORD_INPUT = "input#ctl00_ContentPlaceHolder1_txtPassword"
    _LOGIN_BUTTON = "input#ctl00_ContentPlaceHolder1_btnLogin"

    # Absolute XPaths into the pages shown after login.  They are tied to the
    # exact page layout, so they are kept in one place for easy updating.
    _BALANCE_XPATH = (
        "xpath=/html/body/form/div[3]/div[3]/div[2]/div[3]/div[5]/span[2]"
    )
    _ACCOUNT_LINK_XPATH = (
        "xpath=/html/body/form/div[3]/div[3]/div[2]/div[3]/div[1]/span[2]/a"
    )
    _TRANSACTIONS_XPATH = (
        "xpath=/html/body/form/div[3]/div[3]/div[2]/div/div[2]/div[3]/table/tbody"
    )

    # One transaction row: "DD-MM-YYYY <description> <amount> <balance>".
    # Compiled once at class creation instead of on every parse call.
    _TRANSACTION_LINE = re.compile(
        r'^(?P<date>\d{2}-\d{2}-\d{4})\s+(?P<desc>.*?)\s+(?P<amount>[-\$\d,\.]+)\s+(?P<balance>[-\$\d,\.]+)\s*$'
    )

    def __init__(self, playwright: "Playwright", headless: bool = True):
        """Launch a browser and log in.

        Args:
            playwright: A started Playwright instance.
            headless: Passed through to the browser launch call.

        Raises:
            ValueError: If a required environment variable is missing or empty.
        """
        load_env("EmoneyScraper")
        url = self._require_env("SCRAPER_URL")
        username = self._require_env("SCRAPER_USERNAME")
        password = self._require_env("SCRAPER_PASSWORD")

        self.playwright = playwright
        # Playwright also exposes ``chromium`` and ``webkit`` browser types.
        self.firefox = self.playwright.firefox
        self.browser = self.firefox.launch(headless=headless)
        try:
            self.page = self.browser.new_page()
            self.response = self.page.goto(url)
            self.page.fill(self._LOGIN_ID_INPUT, username)
            self.page.fill(self._PASSWORD_INPUT, password)
            self.page.click(self._LOGIN_BUTTON)
        except Exception:
            # The caller never receives the instance if login fails, so it
            # cannot call close(); shut the browser down here to avoid a leak.
            self.browser.close()
            raise

    def __enter__(self) -> "Scraper":
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the browser when the ``with`` block exits."""
        self.close()

    def get_balance(self) -> str:
        """Return the text of the balance element, unparsed."""
        return self.page.locator(self._BALANCE_XPATH).inner_text()

    def get_snapshot(self) -> dict[str, list[dict[str, object]]]:
        """Return the current balance as a one-account snapshot.

        Returns:
            ``{"accounts": [{"date": <today>, "balance": <float>}]}``.

        Raises:
            ValueError: If the balance text cannot be parsed as a number.
        """
        balance_value = self._parse_money(self.get_balance())
        return {
            "accounts": [
                {
                    "date": date.today(),
                    "balance": balance_value,
                }
            ]
        }

    def get_transactions(self) -> str:
        """Click the account link and return the transaction table body text.

        Every call clicks the account link again before reading the table.
        """
        self.page.click(self._ACCOUNT_LINK_XPATH)
        return self.page.locator(self._TRANSACTIONS_XPATH).inner_text()

    def parse_transactions(self, raw: Optional[str] = None) -> list[dict[str, Any]]:
        """
        Parse raw transactions text into a list of dicts:
        [{'date': date, 'description': str, 'amount': float, 'balance': float}, ...]
        Lines containing 'INSUFF FUNDS' (case-insensitive) are ignored, as are
        lines that do not match the expected layout or whose date or amounts
        cannot be parsed.
        If raw is None, get_transactions() is called to fetch the data.
        """
        if raw is None:
            raw = self.get_transactions() or ""

        parsed: list[dict[str, Any]] = []
        for line in raw.splitlines():
            match = self._TRANSACTION_LINE.match(line.strip())
            if match is None:
                # Blank lines, headers and anything else off-pattern.
                continue

            description = match.group('desc').strip()
            if 'INSUFF FUNDS' in description.upper():
                continue

            try:
                amount = self._parse_money(match.group('amount'))
                balance = self._parse_money(match.group('balance'))
                date_obj = datetime.strptime(match.group('date'), '%d-%m-%Y').date()
            except ValueError:
                # Matched the layout but holds an impossible date or a
                # non-numeric amount (e.g. a lone "-"); skip the row.
                continue

            parsed.append({
                'date': date_obj,
                'description': description,
                'amount': amount,
                'balance': balance,
            })
        return parsed

    def get_transactions_parsed(self) -> list[dict[str, Any]]:
        """Fetch the transactions text and return it parsed."""
        raw = self.get_transactions()
        return self.parse_transactions(raw)

    def close(self) -> None:
        """Close the browser launched by the constructor."""
        self.browser.close()

    @staticmethod
    def _parse_money(value: str) -> float:
        """Convert text such as ``"$1,234.56"`` to a float.

        Raises:
            ValueError: If nothing numeric remains after removing ``$``,
                ``,`` and surrounding whitespace.
        """
        cleaned = value.replace("$", "").replace(",", "").strip()
        try:
            return float(cleaned)
        except ValueError:
            raise ValueError(
                f"Cannot parse monetary amount from {value!r}"
            ) from None

    @staticmethod
    def _require_env(name: str) -> str:
        """Return environment variable *name*.

        Raises:
            ValueError: If the variable is unset or empty.
        """
        value = os.getenv(name)
        if not value:
            raise ValueError(f"Please set {name} in your environment.")
        return value
# Reference XPaths used by Scraper:
#   transaction table body: /html/body/form/div[3]/div[3]/div[2]/div/div[2]/div[3]/table/tbody
#   account link:           /html/body/form/div[3]/div[3]/div[2]/div[3]/div[1]/span[2]/a
#   current balance:        /html/body/form/div[3]/div[3]/div[2]/div[3]/div[5]/span[2]