import os
import re
from datetime import date, datetime

from playwright.sync_api import Playwright, sync_playwright

from config import load_env


class Scraper:
    """Log in to the Emoney portal and read account balances and transactions.

    The constructor launches a Firefox browser through Playwright, opens the
    URL taken from ``SCRAPER_URL`` and submits the login form with
    ``SCRAPER_USERNAME`` / ``SCRAPER_PASSWORD``.  All page elements are
    located with absolute XPaths, so any change to the portal's layout
    requires the selectors below to be updated.

    Call :meth:`close` when finished to shut the browser down.
    """

    # One entry per account shown on the landing page.  ``balance_xpath``
    # points at the balance text, ``link_xpath`` at the account link (whose
    # text is used as the account number and which opens the transactions).
    ACCOUNT_CONFIGS: list[dict[str, str]] = [
        {
            "account_key": "loan",
            "account_name": "emoney-loan",
            "balance_xpath": "xpath=/html/body/form/div[3]/div[3]/div[2]/div[3]/div[5]/span[2]",
            "link_xpath": "xpath=/html/body/form/div[3]/div[3]/div[2]/div[3]/div[1]/span[2]/a",
        },
        {
            "account_key": "forrester",
            "account_name": "emoney-forrester",
            "balance_xpath": "xpath=/html/body/form/div[3]/div[3]/div[2]/div[1]/div[5]/span[2]",
            "link_xpath": "xpath=/html/body/form/div[3]/div[3]/div[2]/div[1]/div[1]/span[2]/a",
        },
        {
            "account_key": "swift",
            "account_name": "emoney-swift",
            "balance_xpath": "xpath=/html/body/form/div[3]/div[3]/div[2]/div[2]/div[5]/span[2]",
            "link_xpath": "xpath=/html/body/form/div[3]/div[3]/div[2]/div[2]/div[1]/span[2]/a",
        },
    ]

    TRANSACTIONS_TABLE_XPATH = "xpath=/html/body/form/div[3]/div[3]/div[2]/div/div[2]/div[3]/table/tbody"

    # Value stored under "org_name" in every snapshot / transaction record.
    ORG_NAME = "Emoney"

    # One transaction row: "<date> <description> <amount> <balance>".
    # The groups must be named because the parser reads each field by name.
    # Compiled once at class creation instead of on every parse call.
    _TRANSACTION_LINE_RE = re.compile(
        r'^(?P<date>\d{2}-\d{2}-\d{4})\s+(?P<desc>.*?)\s+'
        r'(?P<amount>[-\$\d,\.]+)\s+(?P<balance>[-\$\d,\.]+)\s*$'
    )

    def __init__(self, playwright: Playwright, headless: bool = True):
        """Launch Firefox, open the portal and submit the login form.

        Args:
            playwright: A started Playwright instance.
            headless: Passed to the browser launch; ``True`` hides the window.

        Raises:
            ValueError: If ``SCRAPER_URL``, ``SCRAPER_USERNAME`` or
                ``SCRAPER_PASSWORD`` is unset or empty.
        """
        # NOTE(review): load_env is project-local; presumably it populates
        # os.environ for the "EmoneyScraper" profile -- confirm in config.
        load_env("EmoneyScraper")
        url = self._require_env("SCRAPER_URL")
        username = self._require_env("SCRAPER_USERNAME")
        password = self._require_env("SCRAPER_PASSWORD")

        self.playwright = playwright
        # Firefox is used here; Playwright also exposes chromium and webkit.
        self.firefox = self.playwright.firefox
        self.browser = self.firefox.launch(headless=headless)
        try:
            self.page = self.browser.new_page()
            self.response = self.page.goto(url)
            self.page.fill("input#ctl00_ContentPlaceHolder1_txtLoginID", username)
            self.page.fill("input#ctl00_ContentPlaceHolder1_txtPassword", password)
            self.page.click("input#ctl00_ContentPlaceHolder1_btnLogin")
        except Exception:
            # The caller never receives the instance when __init__ fails, so
            # nothing else could close the browser: release it, then re-raise.
            self.browser.close()
            raise

    def get_balance(self) -> dict[str, str]:
        """Return the raw balance text of every configured account.

        Returns:
            Mapping of ``account_key`` to the unparsed text found at the
            account's ``balance_xpath``.
        """
        return {
            account["account_key"]: self.page.locator(account["balance_xpath"]).inner_text()
            for account in self.ACCOUNT_CONFIGS
        }

    def get_snapshot(self) -> dict[str, list[dict[str, object]]]:
        """Return today's balance for each account.

        Accounts whose balance text is empty are skipped.

        Returns:
            ``{"accounts": [...]}`` where each item has the keys ``date``
            (today's date), ``balance`` (float), ``account_name``,
            ``account_num`` (text of the account link) and ``org_name``.
        """
        snapshot_date = date.today()
        balances = self.get_balance()
        accounts: list[dict[str, object]] = []
        for account in self.ACCOUNT_CONFIGS:
            balance_text = balances.get(account["account_key"], "")
            if not balance_text:
                continue
            account_num = self.page.locator(account["link_xpath"]).inner_text().strip()
            accounts.append({
                "date": snapshot_date,
                "balance": self._parse_money(balance_text),
                "account_name": account["account_name"],
                "account_num": account_num,
                "org_name": self.ORG_NAME,
            })
        return {"accounts": accounts}

    def get_transactions(self) -> list[dict[str, object]]:
        """Collect the raw transactions-table text for every account.

        For each account the account link is clicked, the transactions table
        is awaited and its text captured, then the browser navigates back and
        waits for the link to reappear before moving to the next account.

        Returns:
            One dict per account: ``{"account": <config + account_num>,
            "raw": <table text>}``.
        """
        raw_sets: list[dict[str, object]] = []
        for account in self.ACCOUNT_CONFIGS:
            link_xpath = account["link_xpath"]
            # Read the account number before clicking: the link is not on
            # the transactions page.
            account_num = self.page.locator(link_xpath).inner_text().strip()
            self.page.click(link_xpath)
            self.page.wait_for_selector(self.TRANSACTIONS_TABLE_XPATH)
            transaction_body = self.page.locator(self.TRANSACTIONS_TABLE_XPATH).inner_text()
            raw_sets.append({
                "account": {**account, "account_num": account_num},
                "raw": transaction_body,
            })
            self.page.go_back()
            self.page.wait_for_selector(link_xpath)
        return raw_sets

    def parse_transactions(
        self,
        raw: str | list[dict[str, object]] | None = None,
    ) -> list[dict[str, object]]:
        """Parse raw transactions text into a list of dicts.

        Each result looks like
        ``{'date': date, 'description': str, 'amount': float,
        'balance': float}``; when account information is available the keys
        ``account_name``, ``account_num`` and ``org_name`` are added.

        Lines containing 'INSUFF FUNDS' (case-insensitive) are ignored, as
        are lines that do not match the expected row layout.

        Args:
            raw: Either the text of one transactions table, or the list
                produced by :meth:`get_transactions`.  If ``None``,
                :meth:`get_transactions` is called to fetch the data.

        Returns:
            The parsed transactions, in input order.
        """
        if raw is None:
            raw = self.get_transactions() or ""

        if isinstance(raw, list):
            combined: list[dict[str, object]] = []
            for entry in raw:
                if not isinstance(entry, dict):
                    continue
                entry_raw = entry.get("raw") or ""
                account = entry.get("account")
                account_info = account if isinstance(account, dict) else None
                combined.extend(
                    self._parse_transactions_for_account(entry_raw, account_info)
                )
            return combined

        return self._parse_transactions_for_account(raw)

    def _parse_transactions_for_account(
        self,
        raw: str,
        account_info: dict[str, object] | None = None,
    ) -> list[dict[str, object]]:
        """Parse the text of a single transactions table.

        Args:
            raw: Table text, one transaction per line.
            account_info: Optional account metadata; when given, its
                ``account_name`` and ``account_num`` plus ``org_name`` are
                copied onto every parsed transaction.

        Returns:
            Parsed transactions; unparseable lines are skipped.
        """
        parsed: list[dict[str, object]] = []
        for ln in raw.splitlines():
            m = self._TRANSACTION_LINE_RE.match(ln.strip())
            if not m:
                # Blank lines, headers and anything else that is not a row.
                continue

            desc = m.group('desc').strip()
            if 'INSUFF FUNDS' in desc.upper():
                continue

            try:
                amount = self._parse_money(m.group('amount'))
                balance = self._parse_money(m.group('balance'))
                # Dates are read as day-month-year.
                # NOTE(review): confirm the portal does not use MM-DD-YYYY.
                date_obj = datetime.strptime(m.group('date'), '%d-%m-%Y').date()
            except ValueError:
                # float() and strptime() both raise ValueError on bad input;
                # such rows are skipped on a best-effort basis.
                continue

            entry: dict[str, object] = {
                'date': date_obj,
                'description': desc,
                'amount': amount,
                'balance': balance,
            }
            if account_info:
                entry['account_name'] = account_info.get('account_name')
                entry['account_num'] = account_info.get('account_num')
                entry['org_name'] = self.ORG_NAME
            parsed.append(entry)
        return parsed

    def get_transactions_parsed(self) -> list[dict[str, object]]:
        """Fetch and parse the transactions of every configured account."""
        return self.parse_transactions()

    def close(self):
        """Close the browser launched by the constructor."""
        self.browser.close()

    @staticmethod
    def _parse_money(value: str) -> float:
        """Convert text such as ``"$1,234.56"`` to a float.

        Raises:
            ValueError: If the text is not numeric once ``$`` and ``,`` are
                removed.
        """
        return float(value.replace("$", "").replace(",", "").strip())

    @staticmethod
    def _require_env(name: str) -> str:
        """Return environment variable *name*.

        Raises:
            ValueError: If the variable is unset or empty.
        """
        value = os.getenv(name)
        if not value:
            raise ValueError(f"Please set {name} in your environment.")
        return value


# XPath reference notes (these mirror the selectors used for the "loan" account):
#   transactions table body: /html/body/form/div[3]/div[3]/div[2]/div/div[2]/div[3]/table/tbody
#   account link:            /html/body/form/div[3]/div[3]/div[2]/div[3]/div[1]/span[2]/a
#   balance:                 /html/body/form/div[3]/div[3]/div[2]/div[3]/div[5]/span[2]