Last active
          June 23, 2017 23:06 
        
      - 
      
 - 
        
Save jjesusfilho/cb4a5ec91d6c7dbc23ca095cd054b8c5 to your computer and use it in GitHub Desktop.  
    scraper do Tribunal de Justiça do Rio Grande do Sul
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | library(httr) | |
| library(xml2) | |
| library(stringr) | |
| library(boilerpipeR) | |
#' Scrape case-law search result metadata from the TJRS search page.
#'
#' Sends a free-text query to `http://www.tjrs.jus.br/busca/search`, reads the
#' hit count reported on the first result page, then requests the result pages
#' in steps of 10 (the `start` offset) and extracts one row per listed decision.
#' A page that fails to download or parse is skipped with a warning, so the
#' result may be partial.
#'
#' @param BuscaLivre Free-text search expression (sent as `q` and `as_qj`).
#' @param quote If `TRUE`, the expression is passed through `deparse()` so it
#'   is sent wrapped in double quotes (presumably an exact-phrase search;
#'   confirm against the search engine's behaviour).
#' @return A data.frame with columns `processo`, `orgao.julgador`, `relator`,
#'   `classe.processual`, `classe.material`, `secao`, `comarca.origem`,
#'   `ementa`, `pagina` (the `start` offset of the page the row came from) and
#'   `linkHtml`. An empty data.frame when no page could be collected.
tjrsSG_meta <- function(BuscaLivre = "", quote = TRUE) {
  if (quote == TRUE) BuscaLivre <- deparse(BuscaLivre)

  base_url <- "http://www.tjrs.jus.br/busca/"
  query <- list(
    q = "", proxystylesheet = "tjrs_index", getfields = "*",
    entsp = "a__politica-site", wc = "200", wc_mc = "1", oe = "UTF-8",
    ie = "UTF-8", ud = "1", sort = "date:D:S:d1", as_qj = "",
    as_epq = "", as_oq = "", as_eq = "", as_q = "", ulang = "en",
    ip = "", access = "p", entqr = "3", entqrm = "0", client = "tjrs_index",
    filter = "0", start = "0", aba = "juris", site = "juris"
  )
  # Set by name rather than by position (these are elements 1 and 11).
  query$q <- BuscaLivre
  query$as_qj <- BuscaLivre

  response <- GET(paste0(base_url, "search?"), query = query)
  first_page <- content(response, "parsed")

  # Total number of hits as printed on the page.
  # NOTE(review): non-digits are stripped on the assumption that the site may
  # print a thousands separator ("1.234"), which as.numeric() would otherwise
  # read as 1.234 -- confirm against a large result set.
  total <- first_page %>%
    xml_find_all("//*[@class='clearfix left']/div/*[@class='bold'][3]") %>%
    xml_text() %>%
    str_replace_all("\\D", "") %>%
    as.numeric()
  total <- total[1]
  if (is.na(total) || total < 1) {
    warning("tjrsSG_meta: no result count found for this query; ",
            "returning an empty data.frame", call. = FALSE)
    return(data.frame())
  }

  # URL template for the result pages, taken from the first pagination link.
  page_href <- first_page %>%
    xml_find_all("//*[@class='pagination-control']/a[1]/@href") %>%
    xml_text()
  page_href <- page_href[1]
  if (is.na(page_href)) {
    # No pagination link on the page (presumably a single page of results):
    # fall back to the URL of the request just made, which carries `start=0`.
    url_template <- response$url
  } else {
    url_template <- paste0(base_url, page_href)
  }

  # Text of every node matching `xpath`, trimmed, with everything up to the
  # last ": " on the matched line removed (drops the "Label: " prefix).
  field <- function(doc, xpath) {
    xml_find_all(doc, xpath) %>%
      xml_text(trim = TRUE) %>%
      str_replace(".*(:\\s)", "")
  }

  # Integer offsets 0, 10, 20, ... strictly below the hit count, so no page
  # past the last result is requested.
  offsets <- seq.int(0L, as.integer(total) - 1L, by = 10L)

  pages <- lapply(offsets, function(i) {
    tryCatch({
      # format(..., scientific = FALSE) keeps large offsets as plain digits
      # (as.character(1e5) would yield "1e+05").
      page_url <- str_replace(
        url_template, "(?<=start\\=)\\d+",
        format(i, scientific = FALSE, trim = TRUE)
      )
      doc <- content(GET(page_url), "parsed")

      link <- xml_find_all(doc, "//*[@class='larguraPrimColuna']//a[2]/@href") %>%
        xml_text() %>%
        str_c("http://www.tjrs.jus.br/busca/", .) %>%
        str_replace_all("\\s+", "%20")

      data.frame(
        processo = xml_text(
          xml_find_all(doc, "//*[@class='featured font-size-12']")
        ),
        orgao.julgador = field(doc, "//*[@class='larguraUltColuna']"),
        relator = field(doc, "//*[@id='table_resultado']//tr[5]/td[1]"),
        classe.processual = field(doc, "//*[@id='table_resultado']//tr[2]/td[1]"),
        classe.material = field(doc, "//*[@id='table_resultado']//tr[4]/td[2]"),
        secao = field(doc, "//*[@id='table_resultado']//tr[3]/td[2]"),
        comarca.origem = field(doc, "//*[@id='table_resultado']//tr[2]/td[2]"),
        ementa = field(doc, "//*[@class='ementa']"),
        pagina = i,
        linkHtml = link,
        stringsAsFactors = FALSE
      )
    }, error = function(e) {
      # Best effort: keep going, but say which page was lost and why.
      warning("tjrsSG_meta: page with start = ", i, " skipped: ",
              conditionMessage(e), call. = FALSE)
      NULL
    })
  })

  pages <- Filter(Negate(is.null), pages)
  if (length(pages) == 0) {
    return(data.frame())
  }
  # Bind once at the end instead of growing a data.frame inside the loop.
  do.call(rbind, pages)
}
# Download each decision page listed in `dfrs$linkHtml` and run boilerpipeR's
# DefaultExtractor() on the response text; the result is a list with one
# element per link.
# NOTE(review): `dfrs` is not defined in this file -- presumably the data.frame
# returned by tjrsSG_meta(); confirm before running.
# lapply() is used because purrr (which provides map()) is not among the
# packages loaded by this script.
inteiroRS <- lapply(dfrs$linkHtml, function(x) {
  GET(x) %>%
    content("text") %>%
    DefaultExtractor()
})
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment