xecutioner · August 29, 2015 13:57
diff --git a/Wikimeida_extraction b/Wikimeida_extraction
 # Use medialab's extractor tool http://medialab.di.unipi.it/wiki/Wikipedia_Extractor
 > wget http://download.wikimedia.org/itwiki/latest/itwiki-latest-pages-articles.xml.bz2
 > bzcat itwiki-latest-pages-articles.xml.bz2 |
 WikiExtractor.py -cb 250K -o extracted
 In order to combine the whole extracted text into a single file one can issue:
 > find extracted -name '*bz2' -exec bunzip2 -c {} \; > wiki_parsed.xml
 > rm -rf extracted

 # Remove any untouched incomplete tags
 sed -i.bak -e 's/<[^doc>/!][^ >][^>]*>//g;s/<\/[^doc>][^>]*>//g' wiki_parsed.xml


 # Prepend an opening <head> tag so that the file is well-formed XML that XML parsers can read
 echo "<head>"|cat - yourfile > /tmp/out && mv /tmp/out yourfile
 OR 
 sed -i '1s/^/<head> /' file

 Check whether the last doc tag is closed; if it is not, close it.

 #Close the head tag
 echo "</head>" >> file


 # In case we want to split the big XML into smaller files, one for each article.
 awk '/<doc/{x="F"++i;}{print > x;}' wiki_parsed.xml

 --- more http://www.theunixschool.com/2012/06/awk-10-examples-to-split-file-into.html
	# Use medialab's extractor tool http://medialab.di.unipi.it/wiki/Wikipedia_Extractor
	> wget http://download.wikimedia.org/itwiki/latest/itwiki-latest-pages-articles.xml.bz2
	> bzcat itwiki-latest-pages-articles.xml.bz2 |
	WikiExtractor.py -cb 250K -o extracted
	In order to combine the whole extracted text into a single file one can issue:
	> find extracted -name '*bz2' -exec bunzip2 -c {} \; > wiki_parsed.xml
	> rm -rf extracted

	# Remove any untouched incomplete tags
	sed -i.bak -e 's/<[^doc>/!][^ >][^>]*>//g;s/<\/[^doc>][^>]*>//g' wiki_parsed.xml


	# Prepend an opening <head> tag so that the file is well-formed XML that XML parsers can read
	echo "<head>"|cat - yourfile > /tmp/out && mv /tmp/out yourfile
	OR
	sed -i '1s/^/<head> /' file

	Check whether the last doc tag is closed; if it is not, close it.

	#Close the head tag
	echo "</head>" >> file


	# In case we want to split the big XML into smaller files, one for each article.
	awk '/<doc/{x="F"++i;}{print > x;}' wiki_parsed.xml

	--- more http://www.theunixschool.com/2012/06/awk-10-examples-to-split-file-into.html
No results found