NASDAQ100 Yahoo Scraper
Scripts to scrape daily price history from Yahoo Finance (plus a Bovespa CSV converter), normalize the dates, and import everything into MongoDB.
// Convert a semicolon-delimited Bovespa CSV export into a JSON array.
// Expected columns: date (dd/mm/yyyy);last;open;diff;max;min;volume,
// with '.' as thousands separator and ',' as decimal separator.
var fs = require("fs");
var obj = [];
var lineNumber = 0;
fs.readFileSync('./Bovespa.csv').toString().split('\n').forEach(function (line) {
    var array = line.split(";");
    try {
        var d = array[0].split("/");
        // Strip thousands separators and turn decimal commas into points
        var last = array[1].replace(/\./g, "").replace(/,/g, ".");
        var open = array[2].replace(/\./g, "").replace(/,/g, ".");
        var diff = array[3].replace(/%/g, "").replace(/,/g, ".");
        var max = array[4].replace(/\./g, "").replace(/,/g, ".");
        var min = array[5].replace(/\./g, "").replace(/,/g, ".");
        var volume = array[6].replace(/\./g, "").replace(/,/g, ".");
        obj.push({
            date: d[2] + "/" + d[1] + "/" + d[0], // yyyy/mm/dd
            last: last,
            open: open,
            diff: diff,
            max: max,
            min: min,
            volume: volume
        });
    }
    catch (err) {
        // Log rows that do not parse (e.g. a trailing blank line) and skip them
        console.log("ln: ", lineNumber, "arr: ", array);
    }
    lineNumber++;
});
// Write the collected rows out as a single JSON document
fs.writeFile('./Bovespa.json', JSON.stringify(obj), function (err) {
    if (err) {
        console.log('There has been an error saving the converted data.');
        console.log(err.message);
        return;
    }
    console.log('Bovespa.json saved successfully.');
});
console.log(obj[2]); // print a sample entry
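As a quick sanity check, the number normalization can be exercised on its own; the value below is made up but follows the same '.' thousands / ',' decimal convention the script expects:

// Hypothetical Bovespa-style figure meaning 52,482.49
var raw = "52.482,49";
var normalized = raw.replace(/\./g, "").replace(/,/g, ".");
console.log(normalized); // prints "52482.49"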
# Import each scraped CSV into MongoDB, one collection per ticker
# (dates are normalized afterwards in the mongo shell, see below)
for file in $(ls ./stocks)
do
    # collection name = filename minus its extension
    collection=$(echo ${file} | sed 's/\.[^.]*$//')
    mongoimport --host linus.mongohq.com --port 10050 --username admin --password admin --db nasdaq100 --collection ${collection} --type csv --file ./stocks/${file} --headerline --upsert
done
// mongoimport stores Date as a plain string; rewrite it as a real Date
// object so range queries work (shown for the aapl collection)
db.aapl.find().forEach(function (doc) {
    doc.Date = new Date(doc.Date.replace(/-/g, "/"));
    db.aapl.save(doc);
})
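To verify the rewrite actually stuck, a range query in the mongo shell is a quick test: string dates would not match a Date comparison, so any result means the field is now a real date (a sketch for the aapl collection):

// Returns documents only if Date is a true Date object
db.aapl.find({ Date: { $gte: new Date("2013/01/01") } }).limit(1).forEach(printjson)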
AAPL,ADBE,ADI,ADP,ADSK,AKAM,ALTR,ALXN,AMAT,AMGN,AMZN,ATVI,AVGO,BBBY,BIDU,BIIB,BRCM,CA,CELG,CERN,CHKP,CHRW,CHTR,CMCSA,COST,CSCO,CTRX,CTSH,CTXS,DISCA,DLTR,DTV,EBAY,EQIX,ESRX,EXPD,EXPE,FAST,FB,FFIV,FISV,FOSL,FOXA,GILD,GMCR,GOOG,GRMN,HSIC,INTC,INTU,ISRG,KLAC,KRFT,LBTYA,LINTA,LLTC,LMCA,MAR,MAT,MCHP,MDLZ,MNST,MSFT,MU,MXIM,MYL,NFLX,NTAP,NUAN,NVDA,ORLY,PAYX,PCAR,PCLN,QCOM,REGN,ROST,SBAC,SBUX,SHLD,SIAL,SIRI,SNDK,SPLS,SRCL,STX,SYMC,TSLA,TXN,VIAB,VIP,VOD,VRSK,VRTX,WDC,WFM,WYNN,XLNX,XRAY,YHOO
# Alternative normalization: wrap the date column in ISODate(...) so the
# CSV carries an explicit date literal; the header row passes through as-is
for file in $(ls ./stocks.org)
do
    awk -F"," '{
        if (NR <= 1) {
            print $0
        } else {
            print "ISODate("$1"),"$2","$3","$4","$5","$6","$7
        }
    }' ./stocks.org/$file > ./stocks/$file
done
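For reference, this is what the transform does to one row; the sample values are made up but follow Yahoo's Date,Open,High,Low,Close,Volume,Adj Close layout:

// One-row Node equivalent of the awk transform above
var row = "2013-11-27,545.81,548.75,544.00,545.96,8500000,545.96".split(",");
console.log("ISODate(" + row[0] + ")," + row.slice(1).join(","));
// -> ISODate(2013-11-27),545.81,548.75,544.00,545.96,8500000,545.96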
#!/bin/sh
# variables
SCRAPSDIR=./stocks
MONGOHOST="localhost"
MONGOPORT="27017"
MONGOUSER="admin"
MONGOPASS="admin"
MONGOBASE="nasdaq100"

# Help & Usage
function show_help {
    echo "usage: --host <host> --port <port> --db <database>"
    exit 0
}

# Remove previously normalized scraps
rm -f $SCRAPSDIR.norm/*

# Normalize dates: rewrite yyyy-mm-dd as yyyy/mm/dd in the first column;
# the header row (NR <= 1) passes through untouched
for file in $(ls $SCRAPSDIR)
do
    awk -F',' '{
        if (NR <= 1) {
            print $0
        } else {
            gsub(/-/, "/", $1)
            printf("%s,%s,%s,%s,%s,%s,%s\n", $1, $2, $3, $4, $5, $6, $7)
        }
    }' $SCRAPSDIR/$file > $SCRAPSDIR.norm/$file
done

# Point scraps to normalized files
SCRAPSDIR=./stocks.norm

# Import them to mongo
for file in $(ls $SCRAPSDIR)
do
    # make collection name from filename: strip the extension, lowercase it
    collection=$(echo ${file} | sed 's/\.[^.]*$//' | awk '{print tolower($0)}')
    # import to mongodb, replacing any previous collection
    mongoimport --host $MONGOHOST --port $MONGOPORT --username $MONGOUSER --password $MONGOPASS --db $MONGOBASE --collection ${collection} --type csv --file $SCRAPSDIR/${file} --headerline --drop
    # fix data types: turn the Date strings into real Date objects and save them back
    mongo $MONGOHOST:$MONGOPORT/$MONGOBASE -u $MONGOUSER -p $MONGOPASS --eval "db.${collection}.find().forEach(function(doc){
        doc.Date = new Date(doc.Date)
        db.${collection}.save(doc)
        printjson(doc.Date)
    })"
done
#EOF
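After a run, a couple of mongo shell checks (a sketch, reusing the host and credentials above) confirm both the import and the type fix:

// Did the rows land?
db.aapl.count()
// Is Date a real Date object? A range query against strings matches nothing.
db.aapl.find({ Date: { $gte: new Date("2013/01/01") } }).count()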
# Fetch daily price history for every ticker in the nasdaq100 list from Yahoo
for tkr in $(cat nasdaq100)
do
    echo ${tkr}
    curl "http://ichart.finance.yahoo.com/table.csv?s=${tkr}&a=00&b=1&c=1980&d=10&e=27&f=2013&g=d&ignore=.csv" > ./stocks/${tkr}.csv
done
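The ichart query string encodes the date range: a/b/c are the start month (0-based), day, and year, d/e/f the end, and g=d requests daily rows. A rough Node equivalent for a single ticker might look like this (a sketch; the ticker and output path are assumptions):

var http = require("http");
var fs = require("fs");
var tkr = "AAPL"; // any symbol from the nasdaq100 list
var url = "http://ichart.finance.yahoo.com/table.csv?s=" + tkr +
    "&a=00&b=1&c=1980&d=10&e=27&f=2013&g=d&ignore=.csv";
http.get(url, function (res) {
    // stream the CSV response straight to disk
    res.pipe(fs.createWriteStream("./stocks/" + tkr + ".csv"));
});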