Skip to content

Instantly share code, notes, and snippets.

@shantanusingh
Created January 20, 2012 11:28
Show Gist options
  • Save shantanusingh/1646874 to your computer and use it in GitHub Desktop.
Save shantanusingh/1646874 to your computer and use it in GitHub Desktop.
Scrapes Wikipedia (http://en.wikipedia.org/wiki/Bollywood_films_of_2012) to compile a list of 2012 movies as a JSON string
(function(){
var movs = new Array();
/**
 * jQuery plugin: gathers the text nodes (nodeType 3) and BR elements found
 * among the contents of the matched element(s) and of all their descendants.
 *
 * @returns {jQuery} A jQuery set holding the matching nodes.
 */
$.fn.textNodes = function()
{
  // Used as a jQuery .filter() callback, so `this` is the node under test.
  var isTextOrBreak = function(){
    return this.nodeType === 3 || this.nodeName === "BR";
  };

  var root = $(this);
  // Nodes sitting directly inside the matched element(s).
  var ownNodes = root.contents().filter(isTextOrBreak);
  // Nodes sitting inside any descendant element.
  var nestedNodes = root.find('*').contents().filter(isTextOrBreak);

  return ownNodes.add(nestedNodes);
};
/**
 * Concatenates the jQuery `.text()` of every node in the given set.
 *
 * @param {jQuery} nodes Set of nodes to read; iterated with `.each()`.
 * @returns {string} The joined text, or '' when the set is empty.
 */
function getStr(nodes){
  var pieces = '';
  nodes.each(function(idx, node){
    // jQuery passes the current node both as `this` and as the 2nd argument.
    pieces += $(node).text();
  });
  return pieces;
}
/**
 * Tells whether a value is, or coerces cleanly to, a finite number.
 *
 * Both checks are needed: parseFloat rejects '' / null / booleans, while the
 * coercing global isFinite rejects trailing garbage such as '12abc' as well
 * as +/-Infinity.
 *
 * @param {*} n Value to test.
 * @returns {boolean} true when `n` is numeric and finite.
 */
function isNumber(n) {
  var parsed = parseFloat(n);
  if (isNaN(parsed)) {
    return false;
  }
  return isFinite(n);
}
/**
 * Builds the per-row callback used to scrape one movie table.
 *
 * The returned function is meant to be handed to jQuery's `.each()` over a
 * table's `tr` elements, so `this` is the current row and `z` its index.
 * For every row except index 0 it reads the title/link, genre, director and
 * cast cells and pushes a `{movie, url, genre, director, actors}` record onto
 * the enclosing `movs` array.
 *
 * Cells carrying a `rowspan` attribute, or containing a direct `<b>` child,
 * are counted as leading "span" cells; the title is expected in the cell
 * whose index equals that count, followed by genre, director and cast.
 *
 * @param {number} totalCells Column count the caller associates with the
 *   table; it is only echoed in the debug log.
 * @returns {function(number): void} Row callback for `.each()`.
 */
var extractFunc = function(totalCells){
  return function(z){
    // Row 0 is skipped (presumably the table's header row -- confirm).
    if (z === 0) {
      return;
    }
    var url, mName, genre, dir, act;
    console.log('execing trs');

    // Query the row's cells once; `.length` replaces `.size()`, which newer
    // jQuery releases no longer provide.
    var tds = $(this).children('td');
    var cells = tds.length;

    // Count the leading cells that do not hold movie data.
    var spans = 0;
    tds.each(function(){
      var td = $(this);
      if (td.attr('rowspan') || td.children('b').length > 0) {
        spans++;
      }
    });
    console.log('t -c - s' + totalCells +' ' + cells +' ' + spans);

    var startIdx = spans;
    tds.each(function(y){
      var td = $(this);
      console.log('rspan : ' + td.attr('rowspan'));
      if (td.attr('rowspan')) {
        console.log('has rowspan');
        return;
      }
      console.log('start --> ' + startIdx);
      switch (y) {
        case startIdx:
          var link = td.find('i>a');
          url = link.attr('href');
          mName = link.text();
          if (!mName) {
            // Title without a link: fall back to the italic text itself.
            mName = td.find('i').text();
          }
          break;
        case (startIdx + 1):
          genre = getStr(td.textNodes());
          break;
        case (startIdx + 2):
          dir = getStr(td.textNodes());
          break;
        case (startIdx + 3):
          act = getStr(td.textNodes());
          break;
        default:
      }
    });

    var mv = {movie: mName, url: url, genre: genre, director: dir, actors: act};
    console.log(mv);
    movs.push(mv);
  };
};
/*
 * Walk every `.wikitable` on the page and scrape the ones of interest:
 * tables 1-4 are read with extractFunc(6), table 5 with extractFunc(4).
 * Table 0 and anything after table 5 are ignored.
 */
$('.wikitable').each(function(tableIdx){
  console.log('Reading Table : ' + tableIdx);

  var totalCells;
  if (tableIdx >= 1 && tableIdx <= 4) {
    totalCells = 6;
  } else if (tableIdx === 5) {
    totalCells = 4;
  } else {
    // Not a table we scrape.
    return;
  }

  console.log('execing else');
  $(this).children('tbody').children('tr').each(extractFunc(totalCells));
});
/*
 * Send the collected movie list to the local server as a JSON string.
 * The 'JSON' dataType asks jQuery to parse the response as JSON.
 */
$.post('http://localhost:3000/',
  JSON.stringify(movs),
  function(){
    console.log("post done!");
  }, 'JSON')
  // `.fail()` replaces the older jqXHR `.error()` alias, which newer jQuery
  // releases no longer provide.
  .fail(function(jqXHR, textStatus, errorThrown){
    // Log the status text rather than the jqXHR object, which would
    // stringify to "[object Object]".
    var detail = errorThrown ? ' (' + errorThrown + ')' : '';
    console.log('Post errored out! - > ' + textStatus + detail);
  });
}());
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment