Last active
January 19, 2019 17:42
-
-
Save simonmichael/3b32b660cc497a493da6b6ba04141b21 to your computer and use it in GitHub Desktop.
A WIP Wells Fargo bank scraper in haskell that I couldn't get working reliably (2016). If you do, please let me know!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env stack | |
{- stack exec | |
--verbosity info | |
--package webdriver | |
-- ghc | |
-} | |
--package data-default | |
-- stack exec ghci wellsfargo | |
-- stack exec ghc wellsfargohs | |
-- | |
-- https://muehe.org/posts/loading-chase-transactions-into-ledger/ | |
{-# LANGUAGE OverloadedStrings #-} | |
{-# LANGUAGE TupleSections #-} | |
{-# LANGUAGE BangPatterns #-} | |
import Control.Concurrent | |
import Control.Exception | |
import Control.Monad | |
import Control.Monad.IO.Class | |
import Data.Aeson (FromJSON) | |
import qualified Data.ByteString.Lazy as B | |
-- import Data.Default | |
import Data.List | |
import Data.Maybe | |
import qualified Data.Text as T | |
import Data.Time | |
import System.Environment | |
import System.IO | |
import Test.WebDriver | |
import Test.WebDriver.Commands.Wait | |
import Test.WebDriver.JSON (ignoreReturn) | |
config = | |
-- useBrowser (Firefox | |
-- Nothing | |
-- def -- http://hackage.haskell.org/package/webdriver-0.8.4/docs/Test-WebDriver-Capabilities.html#t:LogLevel | |
-- Nothing | |
-- -- (Just "/Applications/Firefox.app/Contents/MacOS/firefox") | |
-- ) | |
defaultConfig | |
{wdHistoryConfig=noHistory} | |
waitTime = 20 | |
-- :m Test.WebDriver Test.WebDriver.Session | |
-- s <- runSession defaultConfig $ getSession | |
-- -- getSession from Test.WebDriver.Session simply returns the existing session stored in the state monad. Once you have a session, you can then run commands on it like this: | |
-- runWD s $ openPage "http://example.com" | |
-- -- runSession creates a new WDSession using the given WDConfig, whereas runWD takes an existing WDSession reference and runs commands with that | |
main = do | |
args <- getArgs | |
let !acctno = read $ head args | |
hPutStr stderr "Username: " >> hFlush stderr | |
u <- getLine | |
hPutStr stderr "Password: " >> hFlush stderr | |
p <- getLine | |
-- let (u,p) = ("X", "X") | |
iolog "starting browser.." | |
csv <- runSession config | |
-- . finallyClose | |
$ do | |
login (T.pack u) (T.pack p) | |
accountDetail acctno | |
accountDownload | |
downloadCsv | |
putStrLn $ chomp csv | |
-- --> login page --> account summary page | |
login u p = do | |
wdlog "loading login page.." | |
openPage "https://connect.secure.wellsfargo.com/auth/login/present" | |
findElem ( ByCSS "input[id=j_username]" ) >>= sendKeys u | |
findElem ( ByCSS "input[id=j_password]" ) >>= sendKeys p | |
findElem ( ByCSS "form[id=Signon]" ) >>= submit | |
-- account summary page --> account detail page | |
accountDetail :: Int -> WD () | |
accountDetail n = do | |
wdlog "loading account summary page.." | |
accountlinks <- waitUntil waitTime $ do | |
es <- findElems (ByCSS "a.account-title-group") | |
-- wdlog $ "accountlinks: " ++ show es | |
expect $ length es == 5 | |
return es | |
accountlinks' <- | |
if (length accountlinks < 5) | |
then do | |
wdlog $ "Warning: waiting for the account links failed: " ++ show accountlinks | |
waitUntil waitTime $ | |
findElems (ByCSS "a.account-title-group") >>= expect . ((==5).length) | |
findElems (ByCSS "a.account-title-group") | |
else return accountlinks | |
-- wdlog $ "accountlinks': " ++ show accountlinks' | |
let l = accountlinks' !! (n-1) | |
Just u <- l `attr` "data-url" | |
wdlog $ "loading account detail page.." ++ T.unpack u | |
-- XXX sometimes no effect, try a delay | |
liftIO $ threadDelay 1000000 | |
wfNavigate u "page" | |
-- account detail page --> download page | |
accountDownload :: WD () | |
accountDownload = do | |
-- wdlog "waiting for download link.." | |
-- XXX sometimes times out waiting for Download Account Activity, doesn't leave account summary page | |
-- l <- waitUntil waitTime $ findElem (ByLinkText "Download Account Activity") | |
-- `onTimeout` do | |
-- writeScreenshot "screen.png" | |
-- failedCommand Timeout "timed out waiting for download link, check screenshot" | |
l <- waitUntil waitTime $ do | |
-- writeScreenshot "screen.png" | |
findElem (ByLinkText "Download Account Activity") | |
Just u <- l `attr` "data-url" | |
wdlog "loading account download page.." | |
wfNavigate u "saml" | |
-- download page --> download CSV file via ajax & return content | |
downloadCsv :: WD String | |
downloadCsv = do | |
wdlog "waiting for CSV option.." | |
waitUntil waitTime $ do | |
e <- findElem (ByCSS "input[id=commaDelimited]") | |
wdlog "clicking CSV option.." | |
click e | |
wdlog "waiting for CSV option to be checked.." | |
waitUntil waitTime $ do | |
es <- findElems (ByCSS "input[id=commaDelimited]:checked") | |
-- wdlog $ "checked: " ++ show es | |
expect $ not $ null es | |
-- credit card accounts don't set a default date range and allow up to 120 days, set that for all | |
let | |
days = 120 | |
fmt = T.pack . formatTime defaultTimeLocale "%m/%d/%y" | |
wdlog $ "setting date range to last "++show days++" days.." | |
todate <- liftIO getCurrentDay | |
let fromdate = addDays (-days) todate | |
findElem (ByCSS "input[id=fromDate]") >>= sendKeys (fmt fromdate) | |
findElem (ByCSS "input[id=toDate]") >>= sendKeys (fmt todate) | |
wdlog "downloading CSV.." | |
wfSubmitDownloadForm | |
wfNavigate :: T.Text -> String -> WD () | |
wfNavigate url displaytype = do | |
waitForJquery | |
ignoreReturn $ executeJS [JSArg url, JSArg displaytype] $ T.unlines | |
["$(document).trigger('mwfNavigation', {" | |
," url: arguments[0]," | |
," displayType: arguments[1]," | |
," targetElement: '.page'" | |
,"})" | |
] | |
wfSubmitDownloadForm :: WD String | |
wfSubmitDownloadForm = do | |
waitForJquery | |
executeJS [] $ T.unlines | |
["var $f = $('form[id=accountActivityDownloadModel]');" | |
,"var jqxhr = $.ajax({" | |
," async: false," | |
," type: 'POST'," | |
," url: $f.attr('action')," | |
," data: $f.serialize() + '&Download=Download'," | |
-- avoid WF's jsonDataFilter | |
," global: false," | |
," dataFilter: function(data,type){ return data; }," | |
,"});" | |
,"return jqxhr.responseText;" | |
] | |
-- in browser console: | |
-- $.ajax({'async':false, 'type':'POST', 'url':$f.attr('action'), 'data':$f.serialize()+'&Download=Download', 'global':false }) | |
-- download binary file: http://jsfiddle.net/3kUXy/ | |
-- TODO handle html response containing "There is no Account Activity information available for this account." | |
waitForJquery :: WD () | |
waitForJquery = do | |
-- wdlog "waiting for jQuery.." | |
waitUntil waitTime $ catchFailedCommand JavascriptError $ ignoreReturn (executeJS [] "jQuery") | |
iolog :: String -> IO () | |
iolog s = hPutStrLn stderr s >> hFlush stderr | |
wdlog :: String -> WD () | |
wdlog = liftIO . iolog | |
p :: Show a => a -> WD () | |
p = liftIO . print | |
printUrl :: WD () | |
printUrl = getCurrentURL >>= liftIO . putStrLn | |
wait :: Double -> WD () | |
wait secs = liftIO $ threadDelay (round $ secs * 1000000) | |
writeScreenshot :: FilePath -> WD () | |
writeScreenshot f = screenshot >>= liftIO . B.writeFile f | |
-- | Remove leading and trailing whitespace. | |
strip :: String -> String | |
strip = lstrip . rstrip | |
-- | Remove leading whitespace. | |
lstrip :: String -> String | |
lstrip = dropWhile (`elem` (" \t"::String)) :: String -> String -- XXX isSpace ? | |
-- | Remove trailing whitespace. | |
rstrip :: String -> String | |
rstrip = reverse . lstrip . reverse | |
-- | Remove trailing newlines/carriage returns. | |
chomp :: String -> String | |
chomp = reverse . dropWhile (`elem` ("\r\n"::String)) . reverse | |
getCurrentDay :: IO Day | |
getCurrentDay = do | |
t <- getZonedTime | |
return $ localDay (zonedTimeToLocalTime t) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment