sebastian-nagel · October 5, 2016 06:59
diff --git a/get_dmoz_news_links.sh b/get_dmoz_news_links.sh
 #!/bin/bash

 #### extract news sites from DMOZ.org ####

 # dependencies
 #  Linux
 #  bash
 #  wget
 #  perl
 #  regexp-assemble

 # Abort on the first failing command.  pipefail extends this to pipelines:
 # without it only the exit status of the last stage counts, so a failing
 # zcat feeding perl would go unnoticed and leave truncated output behind.
 set -e
 set -o pipefail
 # Trace every command as it is executed.
 set -x

 # download data, see http://www.dmoz.org/rdf.html
 #
 # A dump is fetched only when it is not yet present in the current directory.
 # wget writes to a temporary "<name>.part" file that is renamed after a
 # successful transfer, so an interrupted download is never mistaken for a
 # complete dump on the next run.
 for dump in content.rdf.u8.gz structure.rdf.u8.gz categories.txt.gz; do
     if ! test -e "$dump"; then
         # a bare "exit" propagates the exit status of wget
         wget -O "$dump.part" "http://rdf.dmoz.org/rdf/$dump" || exit
         mv -- "$dump.part" "$dump"
     fi
 done

 # extract data for set of categories, e.g.
 #
 # <Topic r:id="Top/News/Newspapers/International">
 #   <catid>998669</catid>
 #   <link r:resource="http://mondediplo.com/"></link>
 #   <link r:resource="http://www.theepochtimes.com/"></link>
 #   <link r:resource="http://global.nytimes.com/"></link>
 #   <link r:resource="http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml"></link>
 # </Topic>
 #

 # Directory that receives all generated lists; it is created when missing.
 LINK_DIR=news

 if ! test -d "$LINK_DIR"; then
     mkdir "$LINK_DIR"
 fi

 # regular expression (Perl) to match relevant categories
 # - a list to start from: http://rdf.dmoz.org/rdf/categories.txt.gz
 # The pattern is used anchored at the start of a category path (start of
 # line, or right after "Top/" in an r:id attribute) and open at the end,
 # so sub-categories of a matched category are matched as well.
 CAT_REGEX='News/(?:Newspapers|Magazines_and_E-zines|Breaking_News|Analysis_and_Opinion|By_(?:Category|Region|Subject))|.*/(?:Newspapers|Magazines_and_E-zines|News_and_Media)'

 # additional hand-crafted patterns to match language-specific or regional categories
 # not covered by translations from English categories
 # - "Journe?aux" accepts the French spelling "Journaux" in addition to the
 #   formerly listed "Journeaux", which looks like a misspelling.
 # NOTE(review): the group in "Новости(?:_и_СМИ)" is mandatory; if plain
 # "Новости" categories should match too it needs a trailing "?" - confirm.
 CAT_REGEX_LANGUAGE_SPECIFIC='(?:.*/(?:Nachrichten_und_Medien|Zeitschriften_und_Online-Magazine|Новости(?:_и_СМИ)|Periódicos|Journe?aux|Magazines_et_e-zines))'

 # matched categories
 #
 # Keeps every line of the category list that starts with a match of
 # CAT_REGEX or CAT_REGEX_LANGUAGE_SPECIFIC and stores the result in
 # dmoz_categories_matched_by_regex.txt inside $LINK_DIR.
 gzip -dc categories.txt.gz \
     | perl -Mutf8 -CDS -lne 'print if m@^(?:'"$CAT_REGEX"'|'"$CAT_REGEX_LANGUAGE_SPECIFIC"')@' \
     >"$LINK_DIR/dmoz_categories_matched_by_regex.txt"

 # get multilingual list of categories (translated from English category names)
 #
 # Inside every <Topic> element of the structure dump whose r:id starts with
 # "Top/" followed by a CAT_REGEX match, the part after the first colon of
 # each <altlang r:resource="..."> value is printed.  The list is stored in
 # dmoz_categories_translations.txt inside $LINK_DIR.
 gzip -dc structure.rdf.u8.gz \
     | perl -Mutf8 -CDS -lne '
          if (m@^\s*<Topic\s+r:id="Top/(?:'"$CAT_REGEX"')@ .. m@^\s*</Topic>@) {
            print $1 if /<altlang r:resource="[^:]+:([^"]+)"/;
          }' \
     >"$LINK_DIR/dmoz_categories_translations.txt"

 # convert to regular expression (using regexp-assemble)
 #
 # The location of the regexp-assemble script is looked up with the shell
 # builtin "command -v".  A missing script is reported explicitly: with an
 # empty path perl would otherwise treat the translations file itself as the
 # program to run.  The script is executed through perl so that -CDS applies.
 REGEXP_ASSEMBLE=$(command -v regexp-assemble) || {
     echo "regexp-assemble not found in PATH" >&2
     exit 1
 }
 CAT_REGEX_TRANSLATIONS=$(perl -CDS "$REGEXP_ASSEMBLE" "$LINK_DIR/dmoz_categories_translations.txt")

 # Collect the links of all matching categories from the content dump.
 #
 # The perl filter is active from a <Topic r:id="..."> line whose id matches
 # "Top/" + CAT_REGEX, "Top/" + CAT_REGEX_LANGUAGE_SPECIFIC or
 # CAT_REGEX_TRANSLATIONS up to the next </Topic> line.
 # - On the opening line the matched part of the id ($1 of the range-start
 #   pattern) is remembered in $category.  The pattern is not anchored at its
 #   end, so this is not necessarily the complete id.
 # - $1 is copied to $c first because the following <Topic test is itself a
 #   successful match and resets the capture variables.
 # - Every <link r:resource="..."> line inside the range is printed as
 #   "<category><TAB><link>".
 # The output is sorted, de-duplicated and written to
 # dmoz_links_by_category.txt inside $LINK_DIR.
 zcat content.rdf.u8.gz \
     | perl -Mutf8 -CDS -lne '
          if (m@^\s*<Topic\s+r:id="(Top/(?:'"$CAT_REGEX"'|'"$CAT_REGEX_LANGUAGE_SPECIFIC"')|(?:'"$CAT_REGEX_TRANSLATIONS"'))@ .. m@^\s*</Topic>@) {
            $c = $1;
            $category = $c if m@^\s*<Topic@;
            print $category, "\t", $1 if /<link r:resource="([^"]+)"/;
          }' \
     | sort -u \
     >$LINK_DIR/dmoz_links_by_category.txt

 # Count the collected links per first path component below Top/World/.
 #
 # Lines not starting with "Top/World/" are skipped.  The prefix is removed
 # and the rest of the line is cut at the first slash or tab, whichever comes
 # first.  Cutting at the tab as well keeps the URL out of the key when the
 # category has no further slash (the first slash of the line would then be
 # inside the URL).  The keys are counted and listed by descending count.
 perl -lne 'next unless s@^Top/World/@@; s@[/\t].*@@; print' "$LINK_DIR/dmoz_links_by_category.txt" \
    | sort | uniq -c | sort -k1,1nr \
    >"$LINK_DIR/dmoz_links_world_by_country_counts.txt"
	#!/bin/bash

	#### extract news sites from DMOZ.org ####

	# dependencies
	# Linux
	# bash
	# wget
	# perl
	# regexp-assemble

	# Abort on the first failing command.  pipefail extends this to pipelines:
	# without it only the exit status of the last stage counts, so a failing
	# zcat feeding perl would go unnoticed and leave truncated output behind.
	set -e
	set -o pipefail
	# Trace every command as it is executed.
	set -x

	# download data, see http://www.dmoz.org/rdf.html
	#
	# A dump is fetched only when it is not yet present in the current directory
	# (the former "\|\|" was a markup-escaped "||" and was handed to test as
	# literal arguments).  wget writes to a temporary "<name>.part" file that is
	# renamed after a successful transfer, so an interrupted download is never
	# mistaken for a complete dump on the next run.
	for dump in content.rdf.u8.gz structure.rdf.u8.gz categories.txt.gz; do
	    if ! test -e "$dump"; then
	        # a bare "exit" propagates the exit status of wget
	        wget -O "$dump.part" "http://rdf.dmoz.org/rdf/$dump" || exit
	        mv -- "$dump.part" "$dump"
	    fi
	done

	# extract data for set of categories, e.g.
	#
	# <Topic r:id="Top/News/Newspapers/International">
	# <catid>998669</catid>
	# <link r:resource="http://mondediplo.com/"></link>
	# <link r:resource="http://www.theepochtimes.com/"></link>
	# <link r:resource="http://global.nytimes.com/"></link>
	# <link r:resource="http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml"></link>
	# </Topic>
	#

	# Directory that receives all generated lists; it is created when missing.
	LINK_DIR=news

	test -d "$LINK_DIR" || mkdir "$LINK_DIR"

	# regular expression (Perl) to match relevant categories
	# - a list to start from: http://rdf.dmoz.org/rdf/categories.txt.gz
	# The alternatives are separated by a plain "|"; an escaped "\|" would be
	# a literal pipe character in Perl and disable the alternation.
	# The pattern is used anchored at the start of a category path and open at
	# the end, so sub-categories of a matched category are matched as well.
	CAT_REGEX='News/(?:Newspapers|Magazines_and_E-zines|Breaking_News|Analysis_and_Opinion|By_(?:Category|Region|Subject))|.*/(?:Newspapers|Magazines_and_E-zines|News_and_Media)'

	# additional hand-crafted patterns to match language-specific or regional categories
	# not covered by translations from English categories
	# - "Journe?aux" accepts the French spelling "Journaux" in addition to the
	#   formerly listed "Journeaux", which looks like a misspelling.
	# NOTE(review): the group in "Новости(?:_и_СМИ)" is mandatory; if plain
	# "Новости" categories should match too it needs a trailing "?" - confirm.
	CAT_REGEX_LANGUAGE_SPECIFIC='(?:.*/(?:Nachrichten_und_Medien|Zeitschriften_und_Online-Magazine|Новости(?:_и_СМИ)|Periódicos|Journe?aux|Magazines_et_e-zines))'

	# matched categories
	#
	# Keeps every line of the category list that starts with a match of
	# CAT_REGEX or CAT_REGEX_LANGUAGE_SPECIFIC and stores the result in
	# dmoz_categories_matched_by_regex.txt inside $LINK_DIR.
	zcat categories.txt.gz \
	    | perl -Mutf8 -CDS -lne 'print if m@^(?:'"$CAT_REGEX"'|'"$CAT_REGEX_LANGUAGE_SPECIFIC"')@' \
	    >"$LINK_DIR/dmoz_categories_matched_by_regex.txt"

	# get multilingual list of categories (translated from English category names)
	#
	# Inside every <Topic> element of the structure dump whose r:id starts with
	# "Top/" followed by a CAT_REGEX match, the part after the first colon of
	# each <altlang r:resource="..."> value is printed.  Leading whitespace
	# before the tags is optional (\s*), so unindented <Topic> lines match too.
	# The list is stored in dmoz_categories_translations.txt inside $LINK_DIR.
	zcat structure.rdf.u8.gz \
	    | perl -Mutf8 -CDS -lne '
	         if (m@^\s*<Topic\s+r:id="Top/(?:'"$CAT_REGEX"')@ .. m@^\s*</Topic>@) {
	           print $1 if /<altlang r:resource="[^:]+:([^"]+)"/;
	         }' \
	    >"$LINK_DIR/dmoz_categories_translations.txt"

	# convert to regular expression (using regexp-assemble)
	#
	# The location of the regexp-assemble script is looked up with the shell
	# builtin "command -v".  A missing script is reported explicitly: with an
	# empty path perl would otherwise treat the translations file itself as the
	# program to run.  The script is executed through perl so that -CDS applies.
	REGEXP_ASSEMBLE=$(command -v regexp-assemble) || {
	    echo "regexp-assemble not found in PATH" >&2
	    exit 1
	}
	CAT_REGEX_TRANSLATIONS=$(perl -CDS "$REGEXP_ASSEMBLE" "$LINK_DIR/dmoz_categories_translations.txt")

	# Collect the links of all matching categories from the content dump.
	#
	# The perl filter is active from a <Topic r:id="..."> line whose id matches
	# "Top/" + CAT_REGEX, "Top/" + CAT_REGEX_LANGUAGE_SPECIFIC or
	# CAT_REGEX_TRANSLATIONS up to the next </Topic> line (leading whitespace
	# is optional).
	# - On the opening line the matched part of the id ($1 of the range-start
	#   pattern) is remembered in $category.  The pattern is not anchored at its
	#   end, so this is not necessarily the complete id.
	# - $1 is copied to $c first because the following <Topic test is itself a
	#   successful match and resets the capture variables.
	# - Every <link r:resource="..."> line inside the range is printed as
	#   "<category><TAB><link>".
	# The output is sorted, de-duplicated and written to
	# dmoz_links_by_category.txt inside $LINK_DIR.
	zcat content.rdf.u8.gz \
	    | perl -Mutf8 -CDS -lne '
	         if (m@^\s*<Topic\s+r:id="(Top/(?:'"$CAT_REGEX"'|'"$CAT_REGEX_LANGUAGE_SPECIFIC"')|(?:'"$CAT_REGEX_TRANSLATIONS"'))@ .. m@^\s*</Topic>@) {
	           $c = $1;
	           $category = $c if m@^\s*<Topic@;
	           print $category, "\t", $1 if /<link r:resource="([^"]+)"/;
	         }' \
	    | sort -u \
	    >"$LINK_DIR/dmoz_links_by_category.txt"

	# Count the collected links per first path component below Top/World/.
	#
	# Lines not starting with "Top/World/" are skipped.  The prefix is removed
	# and the rest of the line is cut at the first slash or tab, whichever comes
	# first.  Cutting at the tab as well keeps the URL out of the key when the
	# category has no further slash (the first slash of the line would then be
	# inside the URL).  The keys are counted and listed by descending count.
	perl -lne 'next unless s@^Top/World/@@; s@[/\t].*@@; print' "$LINK_DIR/dmoz_links_by_category.txt" \
	    | sort | uniq -c | sort -k1,1nr \
	    >"$LINK_DIR/dmoz_links_world_by_country_counts.txt"