Created
May 21, 2014 10:09
-
-
Save nyilmaz/7cb2c5238a0ad4da7adc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package match.mycron.web.job.crawler; | |
import com.google.common.collect.Maps; | |
import org.apache.http.client.methods.CloseableHttpResponse; | |
import org.apache.http.client.methods.HttpUriRequest; | |
import org.apache.http.client.methods.RequestBuilder; | |
import org.apache.http.impl.client.CloseableHttpClient; | |
import org.apache.http.impl.client.HttpClients; | |
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; | |
import org.apache.http.util.EntityUtils; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
import org.springframework.util.StringUtils; | |
import java.io.IOException; | |
import java.util.Map; | |
import java.util.concurrent.ExecutorService; | |
import java.util.concurrent.Executors; | |
import java.util.concurrent.TimeUnit; | |
/** | |
* @author nyilmaz | |
*/ | |
public class doktorsitesicrawler { | |
public static void main(String[] args) throws IOException, InterruptedException { | |
ExecutorService executorService = Executors.newFixedThreadPool(10); | |
PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager(); | |
connectionManager.setDefaultMaxPerRoute(1000); | |
connectionManager.setMaxTotal(1000); | |
final CloseableHttpClient client = HttpClients.custom().setConnectionManager(connectionManager).build(); | |
final Map<String, String> doktormap = Maps.newConcurrentMap(); | |
StringBuffer stringBuffer = new StringBuffer(); | |
for(int i = 1; i < 18; i++) { | |
HttpUriRequest request = RequestBuilder | |
.get() | |
.setUri("http://www.doktorsitesi.com/tumuzmanlar") | |
.addParameter("sayfa", "" + i) | |
.build(); | |
CloseableHttpResponse response = client.execute(request); | |
String html = EntityUtils.toString(response.getEntity(), "ISO-8859-9"); | |
response.close(); | |
Document doc = Jsoup.parse(html); | |
Elements elems = doc.select(".wrapper ul li a"); | |
for(Element elem : elems) { | |
executorService.submit(new mythread(client, doktormap, elem, stringBuffer)); | |
} | |
} | |
executorService.shutdown(); | |
executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); | |
client.close(); | |
connectionManager.close(); | |
System.out.println(stringBuffer.toString()); | |
} | |
static class mythread extends Thread { | |
CloseableHttpClient client; | |
Map<String, String> doktormap; | |
Element elem; | |
StringBuffer stringBuffer; | |
mythread(CloseableHttpClient client, Map<String, String> doktormap, Element elem, StringBuffer stringBuffer) { | |
this.client = client; | |
this.doktormap = doktormap; | |
this.elem = elem; | |
this.stringBuffer = stringBuffer; | |
} | |
public void run() { | |
HttpUriRequest doktorreq = RequestBuilder.get().setUri("http://www.doktorsitesi.com" + elem.attr("href")).build(); | |
try { | |
CloseableHttpResponse doktorresp = client.execute(doktorreq); | |
String doktorhtml = EntityUtils.toString(doktorresp.getEntity(), "UTF-8"); | |
Document doktorsayfa = Jsoup.parse(doktorhtml); | |
String phone = doktorsayfa.select("[itemprop=telephone]").html(); | |
if(StringUtils.hasText(phone)) { | |
String address = doktorsayfa.select(".locality").html(); | |
String branch = doktorsayfa.select(".title").eq(0).html(); | |
synchronized (stringBuffer) { | |
String str = elem.html(); | |
stringBuffer | |
.append(convertUTF8(branch)) | |
.append(" - ") | |
.append(convertUTF8(str)) | |
.append(" - ") | |
.append(phone) | |
.append(" - ") | |
.append(convertUTF8(address)) | |
.append("\n"); | |
} | |
} | |
doktorresp.close(); | |
} catch(IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
} | |
private static String convertUTF8(String str) { | |
return str | |
.replace("Ü", "Ü") | |
.replace("ü", "ü") | |
.replace("Ö", "Ö") | |
.replace("ç", "ç") | |
.replace("Ç", "Ç") | |
.replace("ö", "ö"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment