starting from vanilla centos 6.2 install ( CentOS-6.2-i386-LiveCD.iso )
following https://ccp.cloudera.com/display/CDH4B1/CDH4+Installation
installed oracle jdk
./jdk-6u29-linux-i586-rpm.bin
step1 add or build repo
index.search("europe").hits.each { |hit| puts hit.inspect } | |
#<struct Ferret::Search::Hit doc=6, score=0.446250796318054> | |
#<struct Ferret::Search::Hit doc=7, score=0.446250796318054> | |
#<struct Ferret::Search::Hit doc=8, score=0.446250796318054> | |
puts index[7].load.inspect | |
{:continent=>"europe", :name=>"London"} |
#!/usr/bin/env ruby | |
article = '' | |
STDIN.each do |line| | |
begin | |
line.chomp! | |
if line == '---END.OF.DOCUMENT---' | |
puts "0\t#{article}" | |
article = '' |
# "a.b:1" => [["a", "b", 1]] | |
# "a.b:1,c.d:2" => [["a", "b", 1], ["c", "d", 2]] | |
grammar Hadoop18Counters | |
rule counter_records | |
counter_record ("," counter_record)* { | |
def to_list | |
items = elements[0].to_list | |
if elements[1] |
mkfifo articles | |
hadoop fs -copyFromLocal articles /full/articles-2011-07-08/freebase-wex-2011-07-08-articles.tsv & | |
curl http://download.freebase.com/wex/2011-07-08/freebase-wex-2011-07-08-articles.tsv.bz2 | bunzip2 > articles & | |
#win |
print "weighted cost", weighted_cost | |
print "unweighted cost", weighted_cost |
items in common crawl with a mime type text/html with at least one byte of visible text | |
see https://github.com/matpalm/common-crawl/tree/master/analysis | |
eg there are 880,125,891 urls that were crawled once, 182,752,019 that were crawled twice, etc | |
times crawled freq | |
1 880125891 | |
2 182752019 | |
3 44573683 | |
4 9448470 |
starting from vanilla centos 6.2 install ( CentOS-6.2-i386-LiveCD.iso )
following https://ccp.cloudera.com/display/CDH4B1/CDH4+Installation
installed oracle jdk
./jdk-6u29-linux-i586-rpm.bin
step1 add or build repo
mat@matpc:/tmp$ echo "thе" | hexdump -C | |
00000000 74 68 d0 b5 0a |th...| | |
00000005 | |
mat@matpc:/tmp$ echo "the" | hexdump -C | |
00000000 74 68 65 0a |the.| | |
00000004 |
javascript:(function () {
  /*
   * Bookmarklet: for every `a.lecture-link` anchor on the current page, log a
   * ready-to-paste `curl` command that downloads the lecture video using the
   * cookies visible to this page (document.cookie).
   *
   * Only block comments are used, and only inside the function, so the
   * `javascript:` prefix stays first and the code still works if it is
   * collapsed onto a single line for use as a bookmark URL.
   */

  /*
   * Wrap a value in single quotes for a POSIX shell, escaping embedded single
   * quotes. Without this, `&` in a URL, or `"`, `$`, backticks in a cookie or
   * lecture title, would be interpreted by the shell instead of passed to curl.
   */
  function shellQuote(value) {
    return "'" + String(value).replace(/'/g, "'\\''") + "'";
  }

  $('a.lecture-link').each(function (index) {
    var $lectureLink = $(this);
    /* Turn the first "view" in the href into "download.mp4" to get the video URL. */
    var downloadLink = $lectureLink.attr('href').replace('view', 'download.mp4');
    /* Prefix with the 1-based position so the files sort in lecture order. */
    var downloadName = (index + 1) + '.' + $lectureLink.text().trim() + '.mp4';
    var cookieHeader = 'Cookie:' + document.cookie;
    console.log(
      'curl -L --header ' + shellQuote(cookieHeader) +
      ' ' + shellQuote(downloadLink) +
      ' > ' + shellQuote(downloadName)
    );
  });
})();
import sys | |
# wget http://www.mieliestronk.com/corncob_lowercase.zip | |
# unzip corncob_lowercase.zip | |
# echo -e "i\na" >> corncob_lowercase.txt | |
q = [] | |
words = set() | |
for word in map(str.strip, sys.stdin): |