Skip to content

Instantly share code, notes, and snippets.

@sheimi
Created November 9, 2014 05:27
Show Gist options
  • Save sheimi/bbf5a67c0d6ad2e4869e to your computer and use it in GitHub Desktop.
Save sheimi/bbf5a67c0d6ad2e4869e to your computer and use it in GitHub Desktop.
code in blog.sheimi.me: 2012-05-17-source-code-02 (1) injector
if (LOG.isInfoEnabled()) {
LOG.info("Injector: Converting injected urls to crawl db entries.");
}
JobConf sortJob = new NutchJob(getConf());
sortJob.setJobName("inject " + urlDir);
FileInputFormat.addInputPath(sortJob, urlDir);
sortJob.setMapperClass(InjectMapper.class);
FileOutputFormat.setOutputPath(sortJob, tempDir);
sortJob.setOutputFormat(SequenceFileOutputFormat.class);
sortJob.setOutputKeyClass(Text.class);
sortJob.setOutputValueClass(CrawlDatum.class);
sortJob.setLong("injector.current.time", System.currentTimeMillis());
JobClient.runJob(sortJob);
// merge with existing crawl db
if (LOG.isInfoEnabled()) {
LOG.info("Injector: Merging injected urls into crawl db.");
}
JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
FileInputFormat.addInputPath(mergeJob, tempDir);
mergeJob.setReducerClass(InjectReducer.class);
JobClient.runJob(mergeJob);
CrawlDb.install(mergeJob, crawlDb);
// clean up
FileSystem fs = FileSystem.get(getConf());
fs.delete(tempDir, true);
long end = System.currentTimeMillis();
LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: "
+ TimingUtil.elapsedTime(start, end));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment