Created
November 30, 2013 15:47
-
-
Save danilovazb/7720655 to your computer and use it in GitHub Desktop.
Extrai CSV do IBGE
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/****************************************
 * Library imports for the project
 ****************************************/
import java.io.BufferedInputStream; | |
import java.io.FileOutputStream; | |
import java.io.FileWriter; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.io.PrintWriter; | |
import java.net.SocketTimeoutException; | |
import java.net.URL; | |
import java.net.URLConnection; | |
import java.text.SimpleDateFormat; | |
import java.util.Date; | |
import org.jsoup.HttpStatusException; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
public class downloadCSV { | |
/****************************************************** | |
* Metodo 1 - getLinksUF | |
* - Pega as URLs que contenha uf.php? na pagina principal do IBGE Cidades | |
******************************************************/ | |
public static void getLinksUF(String URL) throws IOException { | |
/**************************************************** | |
* Variaveis para gravação de log | |
*/ | |
FileWriter f = new FileWriter("logs.txt", true); | |
PrintWriter logMetd1 = new PrintWriter(f); | |
//*************************************************** | |
Document doc = Jsoup.connect(URL).get(); | |
Elements urlPesquisa = doc.select("a[href]"); | |
for (Element urlUF : urlPesquisa) { | |
if (urlUF.attr("href").contains("uf.php?") | |
&& !urlUF.attr("href").contains("home.php?lang=_EN") | |
&& !urlUF.attr("href").contains("home.php?lang=_ES") | |
&& !urlUF.attr("href").contains("home.php?lang=") | |
&& !urlUF.attr("href").contains("index.php?lang=")) | |
/* | |
* logMerd1 grava logs | |
*/ | |
System.out.println("*********************************************************************************"); | |
logMetd1.write("*********************************************************************************"); | |
System.out.println("Metodo 1 ---> " + urlUF.attr("abs:href")); | |
logMetd1.write("Metodo 1 ---> " + urlUF.attr("abs:href")); | |
getLinksCidades(urlUF.attr("abs:href")); | |
System.out.println("*********************************************************************************"); | |
logMetd1.write("*********************************************************************************"); | |
} | |
} | |
/****************************************************** | |
* Metodo 2 - getLinksUF | |
* - Pega as URLs que contenha perfil.php? na pagina de UF do IBGE Cidades | |
* - Elimina paginas que tenha lang=_ e /estadosat/perfil.php?lang=&sigla= | |
******************************************************/ | |
public static void getLinksCidades(String URLUF) throws IOException { | |
/**************************************************** | |
* Variaveis para gravação de log | |
*/ | |
FileWriter f = new FileWriter("logs.txt", true); | |
PrintWriter logMetd2 = new PrintWriter(f); | |
//*************************************************** | |
try{ | |
Document doc = Jsoup.connect(URLUF).get(); | |
Elements urlPesquisa = doc.select("a[href]"); | |
for (Element linkCid : urlPesquisa) { | |
if (linkCid.attr("href").contains("perfil.php?") | |
&& !linkCid.attr("href").contains("lang=_") | |
&& !linkCid.attr("href").contains("/estadosat/perfil.php?lang=&sigla=") | |
&& !linkCid.attr("href").contains("home.php?lang=_EN") | |
&& !linkCid.attr("href").contains("home.php?lang=_ES") | |
&& !linkCid.attr("href").contains("home.php?lang=") | |
&& !linkCid.attr("href").contains("index.php?lang=")) | |
System.out.println("Metodo 2 ---> " + linkCid.attr("abs:href")); | |
getLinksDados(linkCid.attr("abs:href")); | |
/* | |
* Grava logs de saída de comandos do módulo 3 | |
*/ | |
logMetd2.write("Metodo 2 ---> " + linkCid.attr("abs:href")); | |
} | |
}catch (SocketTimeoutException e) { | |
} | |
} | |
/****************************************************** | |
* Metodo 3 - getLinksUF | |
* - Pega as URLs que contenha temas.php?lang=&codmun= e &idtema=16&search= | |
******************************************************/ | |
public static void getLinksDados(String URLCID) throws IOException { | |
/**************************************************** | |
* Variaveis para gravação dos arquivos .csv | |
*/ | |
InputStream is = null; | |
BufferedInputStream buf = null; | |
FileOutputStream grava = null; | |
/**************************************************** | |
* Variaveis para gravação de log | |
*/ | |
FileWriter f = new FileWriter("logs.txt", true); | |
PrintWriter logMetd3 = new PrintWriter(f); | |
//*************************************************** | |
try{ | |
Document doc = Jsoup.connect(URLCID).get(); | |
Elements urlPesquisa = doc.select("a[href]"); | |
Elements titulo = doc.select(".csv"); | |
Elements estado = doc.select(".uf"); | |
Elements valor = doc.select("span[class=municipio titulo]"); | |
Elements linkSintese = doc.select("li.sintese"); | |
for (Element link : titulo) { | |
if (link.attr("href").contains("csv.php?lang=&idtema=16&codmun=") | |
&& !link.attr("href").contains("lang=_") | |
&& !link.attr("href").contains("/estadosat/perfil.php?lang=&sigla=") | |
&& !link.attr("href").contains("help.php?lang=") | |
&& !link.attr("href").contains("download/mapa_e_municipios.php?") | |
&& !link.attr("href").contains("/webcart") | |
&& !link.attr("href").contains("/home.php?lang=") | |
&& !link.attr("href").contains("/index.php?lang=") | |
&& !link.attr("href").contains("home.php?lang=_EN") | |
&& !link.attr("href").contains("home.php?lang=_ES") | |
&& !link.attr("href").contains("home.php?lang=") | |
&& !link.attr("href").contains("index.php?lang=")) | |
System.out.println("Metodo 3 ---> " + link.attr("abs:href")+"\n\nEstado: " + estado.text() + "\nCidade: " + valor.text() + "\nDocumento: " + link.text() + "\nLink Download: "+ link.attr("abs:href")); | |
if (link.attr("href").contains("csv.php?lang=&idtema=16&codmun=")){ | |
URL url = new URL(link.attr("abs:href")); | |
url.getHost(); | |
url.getFile(); | |
url.getPort(); | |
url.getUserInfo(); | |
URLConnection con = url.openConnection(); | |
buf = new BufferedInputStream(con.getInputStream()); | |
grava = new FileOutputStream("C:\\Users\\unknown\\Desktop\\Imagem\\" + estado.text() + " - " + valor.text() + " - " + link.text() + ".csv"); | |
int i = 0; | |
byte[] bytesIn = new byte[1024]; | |
while ((i = buf.read(bytesIn)) >= 0) { | |
grava.write(bytesIn, 0, i); | |
} | |
if (buf != null) { | |
buf.close(); | |
} | |
if (grava != null) { | |
grava.close(); | |
} | |
/* | |
* Grava logs de saída de comandos do módulo 3 | |
*/ | |
logMetd3.write("Metodo 3 ---> " + link.attr("abs:href")); | |
}} | |
}catch (SocketTimeoutException e) { | |
} | |
} | |
public static void getDados(String URLLD) throws IOException { | |
/**************************************************** | |
* Variaveis para gravação de log | |
*/ | |
FileWriter f = new FileWriter("logs.txt", true); | |
PrintWriter logMetd4 = new PrintWriter(f); | |
//*************************************************** | |
try{ | |
Document doc = Jsoup.connect(URLLD).get(); | |
Elements urlPesquisa = doc.select("a[href]"); | |
for (Element link : urlPesquisa) { | |
if (link.attr("href").contains("csv.php?lang=") | |
&& link.attr("href").contains("&idtema=16&search=") | |
&& !link.attr("href").contains("lang=_") | |
&& !link.attr("href").contains("/estadosat/perfil.php?lang=&sigla=") | |
&& !link.attr("href").contains("help.php?lang=") | |
&& !link.attr("href").contains("download/mapa_e_municipios.php?") | |
&& !link.attr("href").contains("/webcart") | |
&& !link.attr("href").contains("/home.php?lang=") | |
&& !link.attr("href").contains("/index.php?lang=") | |
&& !link.attr("href").contains("home.php?lang=_EN") | |
&& !link.attr("href").contains("home.php?lang=_ES") | |
&& !link.attr("href").contains("home.php?lang=") | |
&& !link.attr("href").contains("index.php?lang=")) | |
System.out.println("Metodo 4 ---> " + link.attr("abs:href")); | |
/* | |
* Grava logs de saída de comandos do módulo 3 | |
*/ | |
logMetd4.write("Metodo 4 ---> " + link.attr("abs:href")); | |
} | |
}catch (SocketTimeoutException e) { | |
} | |
} | |
/****************************************************** | |
* Metodo Main do programa | |
* @throws IOException | |
******************************************************/ | |
public static void main(String[] args) throws IOException { | |
// TODO Auto-generated method stub | |
getLinksUF("http://cidades.ibge.gov.br/xtras/home.php"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment