Created
October 2, 2018 23:24
-
-
Save anjackson/6f89e18e17765930b30bf33b742209f3 to your computer and use it in GitHub Desktop.
Checking CC WARC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Processing record for url null from test file @0 Avail 503 length 503 | |
WARC-Payload-Digest: null | |
WARC-Block-Digest: null | |
Length was: 503 | |
SHA1 was: NRRSOBRU4N7OJZRFLQUJF7GFSJKV6GB4 | |
Processing record for url http://003sh.ou-net.com/blog/?p=7548 from test file @482 Avail 278 length 278 | |
WARC-Payload-Digest: null | |
WARC-Block-Digest: null | |
Length was: 278 | |
SHA1 was: DVBEEOF4QRZRUSQADENDSU3AG7U7Y2TZ | |
Processing record for url http://003sh.ou-net.com/blog/?p=7548 from test file @922 Avail 106253 length 106253 | |
WARC-Payload-Digest: sha1:RXO5ZZVAQRV37P6ZZUHD2WRBVPS4NRSP | |
WARC-Block-Digest: sha1:DAA7ZUHEZGVJAOWZFYHO6FQVAJV25E2T | |
Length was: 106253 | |
SHA1 was: DAA7ZUHEZGVJAOWZFYHO6FQVAJV25E2T | |
Processing record for url http://003sh.ou-net.com/blog/?p=7548 from test file @17669 Avail 292 length 292 | |
WARC-Payload-Digest: null | |
WARC-Block-Digest: null | |
Length was: 292 | |
SHA1 was: 3ZOKODHDJ27TF7CAYGASXVVQSCNY34YI |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* | |
*/ | |
package uk.bl.wa.analyser; | |
import java.io.ByteArrayOutputStream; | |
import java.io.IOException; | |
import java.net.MalformedURLException; | |
import java.security.MessageDigest; | |
import java.security.NoSuchAlgorithmException; | |
import java.util.Iterator; | |
import org.apache.poi.util.IOUtils; | |
import org.archive.io.ArchiveReader; | |
import org.archive.io.ArchiveReaderFactory; | |
import org.archive.io.ArchiveRecord; | |
import org.archive.util.Base32; | |
import uk.bl.wa.util.Normalisation; | |
/** | |
* @author Andrew Jackson <[email protected]> | |
* | |
*/ | |
public class SimpleWARCAnalyser { | |
/** | |
* @param args | |
* @throws IOException | |
* @throws MalformedURLException | |
* @throws NoSuchAlgorithmException | |
*/ | |
public static void main(String[] args) | |
throws MalformedURLException, IOException, | |
NoSuchAlgorithmException { | |
ArchiveReader reader = ArchiveReaderFactory.get("/Users/andy/Documents/workspace/warc-discovery/CC-MAIN-20180814062251-20180814082251-00000.warc.gz"); | |
Iterator<ArchiveRecord> ir = reader.iterator(); | |
int recordCount = 0; | |
int lastFailedRecord = 0; | |
// Iterate though each record in the WARC file | |
while (ir.hasNext() && recordCount < 4) { | |
ArchiveRecord rec = null; | |
try { | |
rec = ir.next(); | |
} catch (RuntimeException e) { | |
System.err.println("Exception on record after rec " + recordCount + " from test file. " + e); | |
if (lastFailedRecord != recordCount) { | |
lastFailedRecord = recordCount; | |
continue; | |
} | |
System.err.println( | |
"Failed to reach next record, last record already on error - skipping the rest of the records"); | |
break; | |
} | |
final String url = Normalisation.sanitiseWARCHeaderValue(rec.getHeader().getUrl()); | |
System.out.println("\n\nProcessing record for url " + url | |
+ " from test file" + " @" | |
+ rec.getHeader().getOffset() + " Avail " | |
+ rec.available() + " length " | |
+ rec.getHeader().getContentLength() | |
+ "\nWARC-Payload-Digest: " | |
+ rec.getHeader().getHeaderValue("WARC-Payload-Digest") | |
+ "\nWARC-Block-Digest: " | |
+ rec.getHeader().getHeaderValue("WARC-Block-Digest")); | |
ByteArrayOutputStream bos = new ByteArrayOutputStream(); | |
IOUtils.copy(rec, bos); | |
System.err.println("Length was: " + bos.size()); | |
System.err.println("SHA1 was: " + Base32.encode(MessageDigest | |
.getInstance("SHA1").digest(bos.toByteArray()))); | |
recordCount += 1; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment