from bs4 import BeautifulSoup
import re

try:
    from urllib2 import urlopen         # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3


def scrapeNOAA(url):
    # Fetch the page and hand it to BeautifulSoup for parsing.
    soup = BeautifulSoup(urlopen(url))
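
The preview cuts scrapeNOAA off right after the soup is built. A minimal sketch of how the scrape might continue, reusing the imports above and assuming the goal is to collect data-file links from the fetched NOAA page (the anchor selector and the .csv filter are illustrative assumptions, not the author's code):

def scrapeNOAA_sketch(url):
    # Hypothetical continuation: gather links that look like data files.
    soup = BeautifulSoup(urlopen(url))
    links = []
    for a in soup.find_all('a', href=True):
        if re.search(r'\.csv$', a['href']):  # assumed file type
            links.append(a['href'])
    return links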

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class SepPdfItem(Item):
    # define the fields for your item here like:
    # name = Field()
    pass  # placeholder: the preview ends before any fields are declared
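
The template stops before any fields are declared. As a hedged illustration (the field names are guesses, not taken from the project), a populated Scrapy Item looks like this:

from scrapy.item import Item, Field

class SepPdfItemExample(Item):
    # Hypothetical fields -- the real SepPdfItem may declare different ones.
    title = Field()
    url = Field()
    pdf_file = Field()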

<?php
/*
################################################################################################
# Author: mekler
#
# Installation instructions
# sudo apt-get install php5-cli
#
# Command line
# python wef-hack_2008.py 5626281-Financial-Development-Report-2008/ $(ls -1 5626281-Financial-Development-Report-2008/ | grep .pdf$)
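
The $(...) substitution above expands to every .pdf filename inside the report directory. A small Python sketch of building the same argument list (the directory and script names come from the comment; everything else is assumed):

import os
import subprocess

report_dir = '5626281-Financial-Development-Report-2008/'
# Equivalent of: ls -1 <dir> | grep .pdf$
pdfs = [name for name in sorted(os.listdir(report_dir)) if name.endswith('.pdf')]
subprocess.call(['python', 'wef-hack_2008.py', report_dir] + pdfs)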

# coding=utf-8
import pycurl
import StringIO  # Python 2 module; this script targets Python 2
import sys
import ast
import pymongo
from pymongo import MongoClient


def construyeSQL(cadena):
    # construyeSQL ("build SQL") starts by splitting the incoming string on spaces.
    aux = cadena.split(' ')
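
The preview stops right after the split. A hedged guess at how such a helper might finish -- the INSERT template, column naming, and quoting below are assumptions, not the original logic:

def construyeSQL_sketch(cadena, tabla='datos'):
    # Hypothetical completion: turn a space-separated record into an INSERT statement.
    aux = cadena.split(' ')
    columnas = ', '.join('col%d' % i for i in range(len(aux)))
    valores = ', '.join("'%s'" % campo.replace("'", "''") for campo in aux)
    return 'INSERT INTO %s (%s) VALUES (%s);' % (tabla, columnas, valores)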

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class NewDataDownloaderItem(Item):
    # define the fields for your item here like:
    # name = Field()
    pass  # placeholder: the preview ends before any fields are declared

<?php
// Basic cURL helper
function curl($url, $postArray=NULL, $srcPage=NULL) {
    $postFields = "";
    $ch = curl_init($url);                              // Initialise cURL with the target URL
    curl_setopt($ch, CURLOPT_URL, $url);                // Set the URL option explicitly as well
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);     // Return the response body instead of printing it
    curl_setopt($ch, CURLOPT_ENCODING, 'identity');     // Ask for an uncompressed response
    curl_setopt($ch, CURLOPT_COOKIEJAR, 'cookie.txt');  // Persist cookies to cookie.txt
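
The PHP helper is cut off before it sends the request. Since the rest of the collection leans on Python and already imports pycurl, here is a hedged pycurl sketch of the same idea (the function name, defaults, and POST handling are assumptions, not the author's code):

import pycurl
from io import BytesIO

def curl_fetch(url, post_fields=None, cookie_jar='cookie.txt'):
    buf = BytesIO()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, buf.write)     # return the body instead of printing it
    c.setopt(pycurl.ENCODING, 'identity')         # ask for an uncompressed response
    c.setopt(pycurl.COOKIEJAR, cookie_jar)        # persist cookies across requests
    if post_fields:
        c.setopt(pycurl.POSTFIELDS, post_fields)  # switch to a POST when data is given
    c.perform()
    c.close()
    return buf.getvalue()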

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class SnieSepItem(Item):
    # define the fields for your item here like:
    # name = Field()
    pass  # placeholder: the preview ends before any fields are declared

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from diputados.items import DiputadosItem
from time import time
import re


class DiputadosSpider(CrawlSpider):
    name = 'diputados'
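
The spider preview stops at its name. A hedged sketch of the pieces a CrawlSpider of this shape usually adds next, reusing the imports above; the domain, start URL, link pattern, XPaths, and field names are placeholders, not the real project values:

class DiputadosSpiderSketch(CrawlSpider):
    name = 'diputados_sketch'
    allowed_domains = ['example.org']              # assumed domain
    start_urls = ['http://example.org/diputados']  # assumed entry point

    rules = (
        # Follow pages that look like individual profiles and parse each one.
        Rule(SgmlLinkExtractor(allow=(r'/diputado/\d+',)), callback='parse_item'),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = DiputadosItem()  # assumes the item declares the fields used below
        item['nombre'] = hxs.select('//h1/text()').extract()
        item['scraped_at'] = time()
        return item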