uniqueXPaths.pl

Tagged:  •    •    •    •    •    •    •  
#!/usr/bin/perl -w
# uniqueXPaths.pl reports every unique element path found in an XML
# file, together with the number of times each path occurs. The XML
# file is the only mandatory command-line argument.
#
# Copyright 2003, Ramiro Gómez.
#
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
use strict;
use XML::Parser;
use utf8; # Unicode support (not needed with Perl 5.8)

# Require the XML file name. The trailing newline stops Perl from
# appending "at <script> line N." to the usage message.
die "Usage: $0 XMLfile\n" unless @ARGV;
my $xmlfile = shift;

# Maps each element path (e.g. "/root/child") to the number of start
# tags seen at that path. Filled in by h_start() while parsing.
my %paths;

# Create the parser object. Direct method-call syntax is used instead of
# the indirect-object form ("new XML::Parser ..."), which Perl can
# misparse. Only start-tag events are of interest here.
my $parser = XML::Parser->new(
    Handlers => {
        Start => \&h_start,
    },
);

# Parse the file; h_start() is invoked for each start tag encountered.
$parser->parsefile($xmlfile);

# Report variables. They are declared ahead of the formats below, which
# read them each time write() is called.
my ($path, $freq);

# Page header format; $% is the current page number.
format STDOUT_TOP =
Element Paths Page: @>>>>>
$%
Frequency Path
------------------------------------------------------------------
.

# Detail line format: right-justified frequency followed by the path.
format STDOUT =
@>>>>>>>> @*
$freq, $path
.

# Print one report line per path, in sorted (string) order. A plain loop
# is used instead of map because only the side effect (write) matters.
for my $xpath (sort keys %paths) {
    $path = $xpath;
    $freq = $paths{$xpath};
    write;
}

# Start-tag handler registered with the XML parser.
#   $expat   - the expat parser object handed to the handler
#   $element - name of the element whose start tag was just seen
# Builds the slash-separated path leading to $element and increments
# its counter in %paths.
sub h_start {
    my ($expat, $element) = @_;

    # context() lists the names of the currently open elements.
    # The leading empty string makes join emit the initial '/'.
    my @ancestors = $expat->context();
    my $xpath     = join '/', '', @ancestors, $element;

    $paths{$xpath}++;
}

Post new comment

The content of this field is kept private and will not be shown publicly.
  • Web page addresses and e-mail addresses turn into links automatically.
  • Allowed HTML tags: <a> <em> <strong> <cite> <code> <ul> <ol> <li> <dl> <dt> <dd> <p> <br>

More information about formatting options