This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# scrape e-Compras GDF (https://www.compras.df.gov.br/) | |
import os | |
import requests | |
from bs4 import BeautifulSoup | |
baseurl = 'https://www.compras.df.gov.br/publico/' | |
basepath = '/Users/thiagomarzagao/Desktop/HTML/' | |
primeiro_id = 0 # ID of the first auction | |
ultimo_id = 48355 # ID of the last auction (as of 12/18/14) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
parse (HTML -> JSON) e-Compras GDF content | |
''' | |
import os | |
import re | |
import json | |
import socket | |
from bs4 import BeautifulSoup |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<meta charset="utf-8"> | |
<body> | |
<script src="http://d3js.org/d3.v3.min.js"></script> | |
<script src="http://d3js.org/topojson.v1.min.js"></script> | |
<script> | |
var width = 960, | |
height = 1160; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pickle | |
from sklearn.utils import shuffle | |
from sklearn import linear_model | |
from sklearn import cross_validation | |
# carrega X | |
with open('X.pkl', mode = 'rb') as fbuffer: | |
X = pickle.load(fbuffer) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import pickle | |
from nltk.stem import RSLPStemmer | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
def pre_process(): | |
vanilla = u'[^\u0041-\u005A \ | |
\u0061-\u007A \ | |
\u00C0-\u00D6 \ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pickle | |
import numpy as np | |
basepath = '/caminho/ate/CSVs/' # altere conforme necessario | |
flist = [fname for fname in os.listdir(basepath) if '.csv' in fname] | |
grupos = [] | |
classes = [] | |
counter = 0 | |
for fname in flist: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### WORDSCORES (LBG-2003) | |
### author: Thiago Marzagao | |
### contact: marzagao ddott 1 at osu ddott edu | |
import os | |
import numpy as np | |
import pandas as pd | |
ipath = '/Users/username/inputdata/' # folder containing the CSV files | |
opath = '/Users/username/outputdata/' # folder where output will be saved |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### FIGHTIN' WORDS (MCQ-2008) | |
### author: Thiago Marzagao | |
### contact: marzagao ddott 1 at osu ddott edu | |
import os | |
import sys | |
import pandas as pd | |
import numpy as np | |
from numpy import matrix as m | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<div class="highlight"><pre><code class="language-python" data-lang="python"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tm) | |
library(Matrix) | |
setwd('/Users/thiagomarzagao/Dropbox/dataScience/UnB-CIC/aulaText/') | |
comprasnet <- read.table('subset.csv', | |
stringsAsFactors = FALSE, | |
sep = ',', | |
nrows = 1000) | |
corpus <- Corpus(VectorSource(comprasnet$V2)) | |
corpus <- tm_map(corpus, PlainTextDocument) |