This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
sys.path.append(r'D:\jython2.7.0\Lib\site-packages') | |
from unidecode import unidecode | |
#TEST | |
value = "carette leuven" | |
with open(r"C:\Users\Boulot\Desktop\communes.tsv", 'r', encoding="utf8") as f: | |
lieux = [unidecode(name.strip().lower().replace("-", " ")) for name in f] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(stringr) | |
library(purrr) | |
library(rvest) | |
#------------------------------------------------------------------------------# | |
# Author: Andrew Do | |
# Purpose: A bunch of utility functions for the main ScrapeCityToPage The goal | |
# is to be able to scrape up to a specified page number for a given city and | |
# then to store that information as a data frame. The resulting data frame will | |
# be raw and will require additional cleaning, but the structure is more or less |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! python3 | |
import json | |
import sys | |
import os | |
#prend en entrée un Json de "cluster and edit" et renvoye du code R | |
if len(sys.argv) < 2: | |
print("USAGE: ./utils/open_refine_to_R.py [edits.json] > r_file.R") | |
exit(1) | |
json_file = sys.argv[-1] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! python3 | |
import requests | |
from bs4 import BeautifulSoup | |
import string | |
import pandas as pd | |
url = "http://patrom.fltr.ucl.ac.be/contemporain/query.cfm" | |
letters = list(string.ascii_lowercase) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib | |
data = set(row['record']['cells']['extract_persons']['value']) | |
liste = [] | |
for el in data: | |
liste.append('"%s"' %el) | |
terms = "+".join(liste) | |
if len(terms) > 1: | |
return "https://www.googleapis.com/books/v1/volumes?q=" + urllib.quote(terms.encode('utf8')) + "&key=" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import simplejson as json | |
import gzip | |
def getTargetIds(jsonData): | |
data = json.loads(jsonData) | |
return (str(data.get('id', 'null')), | |
str(data.get('norm_name', 'null')), | |
str(data.get('description', 'null')), | |
str(data.get('date_birth', 'null')), |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import copy | |
import os | |
import sys | |
import glob | |
os.chdir(r"FOLDER_PATH") | |
names={} | |
for fn in glob.glob('*.txt'): | |
with open(fn, encoding="utf8") as f: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import requests | |
from bs4 import BeautifulSoup | |
array = ["Q32815", "Q34627"] | |
query = {"query": """ | |
SELECT ?classe ?classeLabel WHERE { | |
wd:%s wdt:P279* ?classe . |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(jqr) | |
data <- readr::read_file("tweets.json") | |
data %>% keys() | |
data %>% jq("{id: .id, hashtag: .entities.hashtags[].text}", | |
"[.id, .hashtag]") %>% jsonlite::toJSON() | |
stri <- "--h" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Test du Stanford NER tagger avec les modèles CRF d'Europeana | |
entrainés sur des journaux : | |
http://lab.kbresearch.nl/static/html/eunews.html | |
La fonction est lente --> songer au multiprocessing | |
""" | |
from nltk.tag import StanfordNERTagger | |
from nltk.tokenize import word_tokenize |