Created
October 5, 2016 06:59
-
-
Save sebastian-nagel/eee8ed036ee89b1ae09f1124ccfa06d7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#### extract news sites from DMOZ.org ####
# dependencies
#   Linux
#   bash
#   wget
#   perl
#   regexp-assemble

# Abort on the first failing command. pipefail is needed in addition to -e
# because otherwise only the exit status of the last command of a pipeline
# counts, and a failing zcat or perl in front of it would go unnoticed.
set -e
set -o pipefail
# Trace every executed command.
set -x

# Download the given URL into the current directory unless a file with the
# same base name already exists there.
#   $1  URL to fetch
# The data is written to "<file>.part" first and renamed only after wget
# succeeded, so an interrupted download is never mistaken for a complete
# file on the next run.
download_if_missing() {
    local url=$1
    local file=${url##*/}
    if [ -e "$file" ]; then
        return 0
    fi
    wget -O "$file.part" "$url"
    mv "$file.part" "$file"
}

# download data, see http://www.dmoz.org/rdf.html
# NOTE(review): DMOZ was reportedly shut down in 2017; confirm that these URLs
# still resolve or point them at a mirror.
download_if_missing http://rdf.dmoz.org/rdf/content.rdf.u8.gz
download_if_missing http://rdf.dmoz.org/rdf/structure.rdf.u8.gz
download_if_missing http://rdf.dmoz.org/rdf/categories.txt.gz
# The links in content.rdf.u8.gz are grouped by category. The script extracts
# the <link> entries of every <Topic> whose id matches one of the category
# patterns defined below, e.g.
#
#   <Topic r:id="Top/News/Newspapers/International">
#     <catid>998669</catid>
#     <link r:resource="http://mondediplo.com/"></link>
#     <link r:resource="http://www.theepochtimes.com/"></link>
#     <link r:resource="http://global.nytimes.com/"></link>
#     <link r:resource="http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml"></link>
#   </Topic>
#
# output directory (all result files are written into it)
LINK_DIR=news
# -p: create the directory if needed, succeed silently if it already exists
mkdir -p "$LINK_DIR"
# regular expression (Perl) to match relevant categories
#  - a list to start from: http://rdf.dmoz.org/rdf/categories.txt.gz
# The pattern is pasted verbatim into Perl m@...@ matches below, so it must
# not contain the delimiter character "@".
CAT_REGEX='News/(?:Newspapers|Magazines_and_E-zines|Breaking_News|Analysis_and_Opinion|By_(?:Category|Region|Subject))|.*/(?:Newspapers|Magazines_and_E-zines|News_and_Media)'
# additional hand-crafted patterns to match language-specific or regional categories
# not covered by translations from English categories
#  - "Journe?aux": the French word for newspapers is spelled "Journaux"; the
#    former spelling "Journeaux" is still accepted so nothing that matched
#    before is lost.
#  - NOTE(review): the group (?:_и_СМИ) carries no quantifier, so only
#    "Новости_и_СМИ" is matched. If plain "Новости" categories are wanted
#    as well, a "?" is missing after the group -- confirm against categories.txt.
CAT_REGEX_LANGUAGE_SPECIFIC='(?:.*/(?:Nachrichten_und_Medien|Zeitschriften_und_Online-Magazine|Новости(?:_и_СМИ)|Periódicos|Journe?aux|Magazines_et_e-zines))'
# matched categories: every line of categories.txt that starts with a match
# of one of the two patterns (written out for manual inspection)
zcat categories.txt.gz \
    | perl -Mutf8 -CDS -lne 'print if m@^(?:'"$CAT_REGEX"'|'"$CAT_REGEX_LANGUAGE_SPECIFIC"')@' \
    >"$LINK_DIR/dmoz_categories_matched_by_regex.txt"
# get multilingual list of categories (translated from English category names)
# For every <Topic> in structure.rdf whose id matches CAT_REGEX, print the
# targets of its <altlang> references (the part after the "<prefix>:").
# The Perl range operator ".." stays true from the opening <Topic> line up
# to and including the closing </Topic> line.
zcat structure.rdf.u8.gz \
    | perl -Mutf8 -CDS -lne '
        if (m@^\s*<Topic\s+r:id="Top/(?:'"$CAT_REGEX"')@ .. m@^\s*</Topic>@) {
            print $1 if /<altlang r:resource="[^:]+:([^"]+)"/;
        }' \
    >"$LINK_DIR/dmoz_categories_translations.txt"
# convert to regular expression (using regexp-assemble)
# Fail with a clear message instead of a silent "set -e" abort when the
# required tool is not installed.
if ! REGEXP_ASSEMBLE=$(command -v regexp-assemble); then
    echo "regexp-assemble not found in PATH, but required to build the category regex" >&2
    exit 1
fi
# The tool is run through perl explicitly so that -CDS (UTF-8 for the
# standard streams and for opened files) applies to it as well.
CAT_REGEX_TRANSLATIONS=$(perl -CDS "$REGEXP_ASSEMBLE" "$LINK_DIR/dmoz_categories_translations.txt")
# Extract the links of all matching categories from content.rdf and write
# one line "<category><TAB><link>" per link, sorted and de-duplicated.
#
# An empty translations regex would leave the empty alternative "(?:)" in
# the pattern below, which matches every <Topic> and would dump the links of
# all categories. Fall back to "(?!)", a pattern that can never match.
TRANSLATIONS_REGEX=${CAT_REGEX_TRANSLATIONS:-'(?!)'}
zcat content.rdf.u8.gz \
    | perl -Mutf8 -CDS -lne '
        # true from a matching opening <Topic> line through its </Topic> line
        if (m@^\s*<Topic\s+r:id="(Top/(?:'"$CAT_REGEX"'|'"$CAT_REGEX_LANGUAGE_SPECIFIC"')|(?:'"$TRANSLATIONS_REGEX"'))@ .. m@^\s*</Topic>@) {
            # $1 holds the matched part of the category id only on the opening
            # <Topic> line; save it before the next successful match resets it
            $c = $1;
            $category = $c if m@^\s*<Topic@;
            print $category, "\t", $1 if /<link r:resource="([^"]+)"/;
        }' \
    | sort -u \
    >"$LINK_DIR/dmoz_links_by_category.txt"
# Count the extracted links per first path component below Top/World/ and
# list the components by descending count.
# The key is cut at the first "/" or TAB: cutting at "/" alone would run into
# the URL ("Name<TAB>http:") for a category without any deeper component.
perl -lne 'next unless s@^Top/World/@@; s@[/\t].*@@; print' "$LINK_DIR/dmoz_links_by_category.txt" \
    | sort | uniq -c | sort -k1,1nr \
    >"$LINK_DIR/dmoz_links_world_by_country_counts.txt"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment