Perl XML stream parser
From AJS.COM
The following program is referenced at:
but the original URL at which it was located:
went away. As a result, I'm archiving it here with the understanding that the copyright and licensing of the following code are not at all clear, so please use it as a reference only.
#!/bin/perl -w
#######################################################################
#
# This tool reads a file tagged with tstmt.dtd (used for the Quran at
# http://metalab.unc.edu/xml/examples/religion/quran/quran.xml),
# extracts the title number from bktlong and bktshort elements and
# creates both a num attribute for the title and a num element
# inside the title element.
#
#   <bktlong>1. The Opening</bktlong>            becomes
#   <bktlong num="1"><num>1</num>The Opening</bktlong>
#
# Usage: pass the XML file as the first argument, or feed the
# document on STDIN. The result is written to STDOUT.
#
# Note: this version outputs the text in UTF-8.
#
#######################################################################

use strict;
use warnings;
use XML::Parser;

# Start tag of the title element currently being opened, WITHOUT its
# closing '>' (e.g. '<bktlong'). It stays pending until we know whether
# a num attribute has to be appended. Undefined when no title is pending.
my $pending_title_tag;

# XML::Parser hands back decoded character strings; encode them on the
# way out so that non-ASCII text is consistently written as UTF-8.
binmode STDOUT, ':encoding(UTF-8)';

# The Stream style looks up StartTag/EndTag/Text in the calling package.
my $parser = XML::Parser->new( Style => 'Stream' );

if ( defined $ARGV[0] && $ARGV[0] ne '' ) {
    $parser->parsefile( $ARGV[0] );    # parse the named file
}
else {
    $parser->parse( \*STDIN );         # no file given: parse standard input
}

exit;

# Called by the Stream style for every start tag.
#   $p  - the XML::Parser::Expat instance
#   $gi - the element name
# Title elements (bktlong/bktshort) are held back until Text (or the next
# tag) decides how to close them; every other start tag is printed as is.
sub StartTag {
    my ( $p, $gi ) = @_;

    # A title whose first content is a child element never reaches Text
    # while pending, so close its start tag before printing anything else.
    _close_pending_title_tag();

    my $tag = $p->recognized_string();

    # The Stream style reports an end tag for every element, including
    # empty ones, so turn a self-closing '<x/>' into a plain '<x>' to
    # avoid emitting both '<x/>' and '</x>'.
    $tag =~ s{\s*/>\z}{>};

    if ( $gi eq 'bktlong' || $gi eq 'bktshort' ) {
        # Keep the tag (with any attributes it already had) open.
        $tag =~ s{\s*>\z}{};
        $pending_title_tag = $tag;
    }
    else {
        print $tag;
    }
    return;
}

# Called by the Stream style for every end tag.
#   $p  - the XML::Parser::Expat instance (unused)
#   $gi - the element name
# Prints the end tag, first closing a still-pending (i.e. empty) title tag.
sub EndTag {
    my ( $p, $gi ) = @_;

    _close_pending_title_tag();
    print "</$gi>";
    return;
}

# Called by the Stream style with the accumulated character data in $_.
# The first text inside a title element is split into its leading number
# and the remaining text; all other text is copied through. Entities
# arrive already expanded, so the text is re-escaped before printing.
sub Text {
    my $text = $_;    # copy at once: $_ is global and easily clobbered

    if ( !defined $pending_title_tag ) {
        print _escape_text($text);
        return;
    }

    my $open_tag = $pending_title_tag;
    undef $pending_title_tag;    # not in a title start tag anymore

    # Separate the number (and an optional '.') from the rest of the
    # title. /s lets the title span several lines.
    my ( $title_no, $title_text )
        = $text =~ m{\A (\d+) [.]? \s* (.*) \z}xs;

    if ( defined $title_no ) {
        print qq{$open_tag num="$title_no">},    # close the title tag
            "<num>$title_no</num>",              # print the num element
            _escape_text($title_text);
    }
    else {
        # No leading number: leave the title untouched.
        print "$open_tag>", _escape_text($text);
    }
    return;
}

# Print the held-back title start tag unchanged (no num attribute) and
# forget it. Does nothing when no title tag is pending.
sub _close_pending_title_tag {
    return if !defined $pending_title_tag;

    print "$pending_title_tag>";
    undef $pending_title_tag;
    return;
}

# Return a copy of $text with the characters that are markup-significant
# in XML character data replaced by their predefined entities.
sub _escape_text {
    my ($text) = @_;

    $text =~ s/&/&amp;/g;    # must come first
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}
BlogMarks
del.icio.us
digg
Fark
Furl
Newsvine
reddit
Segnalo
Simpy
Slashdot
smarking
Spurl
StumbleUpon
Wists