Created
October 19, 2017 14:20
-
-
Save sebastian-nagel/ff4379f9e2115d3c922416d520274b86 to your computer and use it in GitHub Desktop.
Link path identifiers from a single Common Crawl WAT file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#% zgrep '^{"Container' .../CC-MAIN-XXX-XXX.warc.wat.gz \
# | jq --raw-output '."Envelope"."Payload-Metadata"."HTTP-Response-Metadata"."HTML-Metadata"."Links"[]?.path' \
# | sort | uniq -c | sort -k1,1nr
# see also:
# https://github.com/commoncrawl/ia-web-commons/issues/9
# https://github.com/commoncrawl/ia-web-commons/issues/8
# https://github.com/iipc/webarchive-commons/pull/72
7777908 A@/href | |
1266284 IMG@/src | |
90022 STYLE/#text | |
82498 FORM@/action | |
30165 A@/data-href | |
29271 IFRAME@/src | |
12383 DIV@/data-href | |
9034 TD@/background | |
8339 AREA@/href | |
7932 SPAN@/data-href | |
7595 INPUT@/src | |
6296 IMG@/longdesc | |
2524 EMBED@/src | |
1521 TABLE@/background | |
1481 BUTTON@/data-href | |
1125 BLOCKQUOTE@/cite | |
995 OBJECT@/codebase | |
860 OBJECT@/data | |
608 SOURCE@/src | |
405 LI@/data-href | |
378 INPUT@/data-href | |
370 BODY@/background | |
351 LABEL@/data-href | |
307 null | |
210 VIDEO@/poster | |
191 TR@/data-href | |
158 IMG@/data-href | |
101 DIV@/data-uri | |
73 FRAME@/src | |
57 APPLET@/codebase | |
45 AUDIO@/src | |
41 VIDEO@/src | |
35 OPTION@/data-href | |
35 P@/data-href | |
32 A@/data-uri | |
30 B@/data-href | |
30 TR@/background | |
27 SECTION@/data-href | |
26 G@/data-href | |
26 SAMP@/data-href | |
24 FIGURE@/data-href | |
24 SPAN@/data-uri | |
20 H3@/data-href | |
17 TH@/background | |
15 BUTTON@/data-uri | |
14 LI@/data-uri | |
12 SELECT@/data-href | |
11 FB:LOGIN-BUTTON@/background | |
11 I@/data-href | |
11 LINK@/data-href | |
8 SECTION@/data-uri | |
8 TRACK@/src | |
7 STYLE@/data-href | |
6 FORM@/data-href | |
5 FB:COMMENTS@/data-href | |
4 ASIDE@/data-uri | |
4 BUTTON@/formaction | |
4 NOSCRIPT@/data-href | |
4 SUP@/data-href | |
3 !—DIV@/data-href | |
3 G:PLUSONE@/data-href | |
3 P@/data-uri | |
2 ACLASS@/data-href | |
2 ARTICLE@/data-href | |
2 ARTICLE@/data-uri | |
2 H1@/data-href | |
2 INS@/cite | |
2 SCRIPT@/data-href | |
2 UL@/data-href | |
1 ASIDE@/data-href | |
1 FB:LIKE-BOX@/data-href | |
1 FB:LIKE@/data-href | |
1 FIGURE@/data-uri | |
1 FOOTER@/data-uri | |
1 HTML@/data-uri | |
1 IFRAME@/background | |
1 IMG@/background | |
1 IMG@/data-uri | |
1 INPUT@/data-uri | |
1 NAV@/data-uri | |
1 NOSCRIPT@/data-uri | |
1 Q@/cite |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment