Perl XML stream parser

From AJS.COM

Jump to: navigation, search

The following program is referenced at:

http://www.xml.com/pub/a/2000/04/05/feature/

but the original URL at which it was located:

http://www.xmltwig.cx/perl_xml/ex_parser

went away. As a result, I'm archiving it here with the understanding that the copyright and licensing of the following code are not at all clear, so please use it as a reference only.

#!/bin/perl -w

#######################################################################
#                                                                     #
#  This tool reads a file tagged with the tstmt.dtd, which is used    #
#  for the quran at:                                                  #
#    http://metalab.unc.edu/xml/examples/religion/quran/quran.xml     #
#  It extracts the title number from bktlong and bktshort elements    #
#  and creates both a num attribute for the title and a num element   #
#  inside the title element.                                          #
#  <bktlong>1. The Opening</bktlong> becomes                          #
#  <bktlong num="1"><num>1</num>The Opening</bktlong>                 #
#                                                                     #
#  Note: this version outputs the text in UTF-8                       #
#                                                                     #
#######################################################################

use strict;
use XML::Parser;

# Flag shared by the StartTag and Text handlers: set when a title start
# tag has been printed without its closing '>', cleared once Text has
# completed that tag.
my $in_title=0;

# The header promises UTF-8 output. Without an encoding layer perl writes
# characters in the 128-255 range as single Latin-1 bytes and warns about
# wider ones, so make the encoding explicit.
binmode( STDOUT, ':encoding(UTF-8)');

# The Stream style looks up the handler subs (StartTag, Text) by name in
# this package.
my $parser= XML::Parser->new( Style => 'Stream');

# A file name on the command line wins, otherwise read standard input.
# Test for definedness/length rather than truth so a file named "0" works.
if( defined $ARGV[0] && length $ARGV[0])
  { $parser->parsefile( $ARGV[0]); }               # parse the file
else
  { $parser->parse( \*STDIN); }

exit;

# StartTag( $p, $gi)
#   $p  - parser object; only its recognized_string method is used here
#   $gi - name of the element whose start tag was just read
#
# Title elements (bktlong, bktshort) are printed WITHOUT their closing
# '>' so that the Text handler can append the num attribute and close the
# tag; $in_title is raised to tell Text to do so. Attributes already
# present on a title element are kept. Every other start tag is echoed
# exactly as it was recognized.
sub StartTag                                       # called for all start tags
  { my( $p, $gi)= @_;
    my $tag= $p->recognized_string();              # the tag as it was read

    if( ($gi eq 'bktlong') || ($gi eq 'bktshort') )
      { $tag=~ s{\s*/?>\z}{};                      # drop only the final '>' (or '/>')
        print "$tag ";                             # will be closed in Text
        $in_title=1;                               # triggers Text processing
      }
    else
      { print $tag; }                              # else just print
  }

# Text()
#   Reads the current chunk of character data from $_.
#
# Outside a title the chunk is printed as is, apart from re-escaping
# markup characters. Right after a title start tag ($in_title set) the
# chunk is split into a leading number and the remaining text:
#   "1. The Opening"  ->  num="1"><num>1</num>The Opening
# which completes the start tag left open by StartTag. A title that does
# not start with a number only gets its tag closed; its text is kept
# unchanged and a warning is sent to STDERR.
sub Text                                           # called for each string
  { my $text= $_;

    if( !$in_title)
      { print _escape_text( $text);                # not a title: just print
        return;
      }

    $in_title=0;                                   # not in title anymore

    # /s lets (.*) span line breaks and \z keeps a trailing newline, so
    # multi-line titles are neither rejected nor truncated
    if( $text=~ m{\A(\d+)\.?\s*(.*)\z}s)           # separate the num from
      { my( $title_no, $title_text)= ($1, $2);     # the rest of the text
        print "num=\"$title_no\">";                # close the title tag
        print "<num>$title_no</num>",              # print the num element
              _escape_text( $title_text);
      }
    else
      { warn "title without a leading number: '$text'\n";
        print ">", _escape_text( $text);           # close the tag, keep the text
      }
  }

# Re-escape the characters that may not appear literally in XML character
# data. NOTE(review): this relies on the parser handing over text with
# entity references already expanded - confirm against the XML::Parser docs.
sub _escape_text
  { my( $text)= @_;
    $text=~ s/&/&amp;/g;                           # must come first
    $text=~ s/</&lt;/g;
    $text=~ s/>/&gt;/g;
    return $text;
  }
Personal tools