Last active
December 14, 2015 01:49
-
-
Save AlbertoMonteiro/5009212 to your computer and use it in GitHub Desktop.
Capturar todos os cnaes do site do governo.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Net; | |
using System.Text.RegularExpressions; | |
using System.Threading.Tasks; | |
using HtmlAgilityPack; | |
namespace ConsoleApplication7 | |
{ | |
class Program | |
{ | |
// Output writer shared by all worker threads; BuscaLinksSubClasses locks on it while writing.
private static StreamWriter stream;

/// <summary>
/// Entry point. Downloads the CNAE structure index page, then follows every distinct
/// link found on it in parallel, handing each downloaded page to BuscaLinksGrupos.
/// Output goes to C:/cnaes.txt, which is deleted first so each run starts from scratch.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    var htmlDocument = RequestPage("http://www.cnae.ibge.gov.br/estrutura.asp?TabelaBusca=CNAE_201@CNAE%202.1%20-%20Subclasses@0@cnaefiscal@0");
    var linksDasSessoes = ProcuraA(htmlDocument.DocumentNode);
    File.Delete("C:/cnaes.txt");

    // Parallel.ForEach only returns once every worker has finished, so the writer can be
    // disposed safely when the using block ends. Previously it was never closed.
    using (stream = new StreamWriter("C:/cnaes.txt", true))
    {
        Parallel.ForEach(
            linksDasSessoes.Select(x => x.GetAttributeValue("href", "")).Distinct(),
            s =>
            {
                var document = RequestPage("http://www.cnae.ibge.gov.br/" + s);
                BuscaLinksGrupos(ProcuraA(document.DocumentNode));
            });
    }
}
/// <summary>
/// Downloads <paramref name="url"/> and parses the response as HTML.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>
/// The parsed document, or a new empty <see cref="HtmlDocument"/> when the download or
/// parse fails (best effort: a single bad page must not abort the crawl).
/// </returns>
private static HtmlDocument RequestPage(string url)
{
    using (var webClient = new WebClient())
    {
        var htmlDocument = new HtmlDocument();
        try
        {
            // Dispose the response stream so the underlying connection is released;
            // many of these requests run concurrently.
            using (var response = webClient.OpenRead(url))
            {
                htmlDocument.Load(response);
            }
        }
        catch (Exception ex)
        {
            // Still best effort, but leave a trace instead of failing silently.
            Console.Error.WriteLine("Falha ao carregar {0}: {1}", url, ex.Message);
            return new HtmlDocument();
        }
        return htmlDocument;
    }
}
/// <summary>
/// Follows, in parallel, each distinct href among the given anchors whose text is exactly
/// six characters long. On every downloaded page, each anchor returned by ProcuraA whose
/// text is nine characters long is followed in turn, and one line
/// "<anchor inner HTML>;<text of the last b element>" is appended to the shared output
/// writer and echoed to the console.
/// </summary>
/// <param name="linksDasClasses">Candidate anchors; only those with 6-character text are followed.</param>
private static void BuscaLinksSubClasses(IEnumerable<HtmlNode> linksDasClasses)
{
    // NOTE(review): the 6/9 character filters presumably match class codes ("0111-3")
    // and subclass codes ("0111-3/01") as rendered by the site - confirm.
    var classHrefs = linksDasClasses
        .Where(x => x.InnerText.Length == 6)
        .Select(x => x.GetAttributeValue("href", ""))
        .Distinct();

    Parallel.ForEach(classHrefs, s =>
    {
        var document = RequestPage("http://www.cnae.ibge.gov.br/" + s);
        foreach (var linksDasSubclass in ProcuraA(document.DocumentNode).Where(x => x.InnerText.Length == 9))
        {
            var parteLink = linksDasSubclass.GetAttributeValue("href", "");
            var subDocument = RequestPage("http://www.cnae.ibge.gov.br/" + parteLink);

            // The detail page may have failed to load or may contain no <b> element.
            // ProcuraB can then yield null or nothing, and calling Last() would throw
            // and abort the entire parallel crawl. Skip the entry instead.
            var boldNodes = ProcuraB(subDocument.DocumentNode);
            var htmlNode = boldNodes == null ? null : boldNodes.LastOrDefault();
            if (htmlNode == null)
            {
                Console.Error.WriteLine("Ignorado (sem descricao): {0}", linksDasSubclass.InnerHtml);
                continue;
            }

            var descricao = htmlNode.InnerText;

            // The writer is shared by all workers; serialize writes and flush right away
            // so finished entries reach the disk even if the run is interrupted.
            lock (stream)
            {
                stream.WriteLine("{0};{1}", linksDasSubclass.InnerHtml, descricao);
                stream.Flush();
            }
            Console.WriteLine("Salvo: {0}", linksDasSubclass.InnerHtml);
        }
    });
}
/// <summary>
/// Follows, in parallel, each distinct href among the given anchors whose text is exactly
/// three characters long, and hands the anchors that ProcuraA finds on every downloaded
/// page to BuscaLinksSubClasses.
/// </summary>
/// <param name="linksDosGrupos">Candidate anchors; only those with 3-character text are followed.</param>
private static void BuscaLinksClasses(IEnumerable<HtmlNode> linksDosGrupos)
{
    // Lazily evaluated: the sequence is enumerated by Parallel.ForEach below.
    var hrefs = (from anchor in linksDosGrupos
                 where anchor.InnerText.Length == 3
                 select anchor.GetAttributeValue("href", "")).Distinct();

    Parallel.ForEach(hrefs, href =>
    {
        var page = RequestPage("http://www.cnae.ibge.gov.br/" + href);
        var anchors = ProcuraA(page.DocumentNode);
        BuscaLinksSubClasses(anchors);
    });
}
/// <summary>
/// Follows, in parallel, each distinct href among the given anchors whose text is exactly
/// two characters long, and hands the anchors that ProcuraA finds on every downloaded
/// page to BuscaLinksClasses.
/// </summary>
/// <param name="linksDasDivisoes">Candidate anchors; only those with 2-character text are followed.</param>
private static void BuscaLinksGrupos(IEnumerable<HtmlNode> linksDasDivisoes)
{
    IEnumerable<string> divisionHrefs = linksDasDivisoes
        .Where(link => link.InnerText.Length == 2)
        .Select(link => link.GetAttributeValue("href", ""))
        .Distinct();

    // Work performed for each relative URL: fetch the page and descend one level.
    Action<string> visitDivision = relativeUrl =>
    {
        var divisionPage = RequestPage("http://www.cnae.ibge.gov.br/" + relativeUrl);
        BuscaLinksClasses(ProcuraA(divisionPage.DocumentNode));
    };

    Parallel.ForEach(divisionHrefs, visitDivision);
}
/// <summary>
/// Finds the anchor elements of the document whose entire text consists only of
/// digits, '-', '/', '.' and spaces.
/// </summary>
/// <param name="documentNode">Node whose document is searched (the XPath "//a" is document-wide).</param>
/// <returns>The matching anchors, lazily filtered; an empty sequence when there are no anchors at all.</returns>
private static IEnumerable<HtmlNode> ProcuraA(HtmlNode documentNode)
{
    // SelectNodes returns null (not an empty collection) when nothing matches, e.g. for
    // the empty document RequestPage returns after a failed download. Calling Where on
    // null would throw and abort the whole parallel crawl.
    IEnumerable<HtmlNode> anchors = documentNode.SelectNodes("//a");
    if (anchors == null)
    {
        return Enumerable.Empty<HtmlNode>();
    }

    return anchors.Where(x => Regex.IsMatch(x.InnerText, @"^([\d-/. ]+)$"));
}
/// <summary>
/// Finds all b (bold) elements of the document.
/// </summary>
/// <param name="documentNode">Node whose document is searched (the XPath "//b" is document-wide).</param>
/// <returns>The b elements found; an empty sequence (never null) when there are none.</returns>
private static IEnumerable<HtmlNode> ProcuraB(HtmlNode documentNode)
{
    // SelectNodes returns null when nothing matches; normalize to an empty sequence so
    // callers can enumerate the result without a null check.
    IEnumerable<HtmlNode> boldNodes = documentNode.SelectNodes("//b");
    return boldNodes ?? Enumerable.Empty<HtmlNode>();
}
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment