Wednesday, April 20, 2016

apache nutch

apache-nutch-1.11
--------------------------- one shot
./bin/nutch parsechecker -dumpText http://www.deco.com > out.txt
./bin/nutch parsechecker -dumpText http://www.deco.com/cuisine/vos-visites |grep tc

wget -O - 'http://www.deco.com/cuisine/vos-visites' |grep tc_vars.http

--------------------------- crawl
./bin/crawl urls/seeds.txt out-crawl 5
(Look at what the crawl script does: it is basically a loop calling bin/nutch.)

./bin/nutch fetch 5-out-crawl/segments/20160717060128/ -noParsing  (the same fetch step that the crawl script runs)

--------------------------- read the crawldb
./bin/nutch readdb out-crawl/crawldb -stats
./bin/nutch readdb 4-out-crawl/crawldb -dump 4-out-crawl-db-dump-20160512 -format csv -topN 1000

--------------------------- read /segments
./bin/nutch readseg -dump 4-out-crawl/segments/20160716234706 4-out-crawl-readseg-20160716234706
grep -ir outlink 3-out-crawl-readseg  --color  |wc

--------------------------- read one key from a segment
./bin/nutch readseg -get 4-out-crawl/segments/20160717025923  "http://www.moulinex.fr/Cuisson/Croques-%26-Gaufres/c/waffle%2Bmakers"  -nocontent -noparse -noparsetext

--------------------------- read the HTML for one key from a segment
./bin/nutch readseg -get 2-out-crawl-wiki-nikon-num-round-2/segments/20160823085007/  "https://en.wikipedia.org/wiki/Nikon" -nofetch -nogenerate -noparse -noparsedata -noparsetext > Nikon.html

--------------------------- doc
https://wiki.apache.org/nutch/CommandLineOptions

--------------------------- plugin
bin/nutch plugin   parse-html   org.apache.nutch.parse.html.HtmlParser   index2.htm
https://wiki.apache.org/nutch/bin/nutch%20plugin
.
.