#!/usr/bin/perl # -*- mode: cperl; indent-tabs-mode: nil; tab-width: 3; cperl-indent-level: 3; -*- use warnings; use strict; use utf8; BEGIN { $| = 1; binmode(STDIN, ':encoding(UTF-8)'); binmode(STDOUT, ':encoding(UTF-8)'); } use open qw( :encoding(UTF-8) :std ); use Getopt::Long; Getopt::Long::Configure('no_ignore_case'); my %opts = (); GetOptions (\%opts, 'help|?', 'grammar|g=s', 'output|o', 'strip', 'secondary', 'regex', 'icase', 'baseforms', 'wordforms', 'all', ); sub print_help { print <<'XOUT'; Usage: cg-strictify [OPTIONS] Options: -?, --help outputs this help -g, --grammar the grammar to parse; defaults to first non-option argument -o, --output outputs the whole grammar with STRICT-TAGS --strip removes superfluous LISTs from the output grammar; implies -o --secondary adds secondary tags (<...>) to strict list --regex adds regular expression tags (/../r, <..>r, etc) to strict list --icase adds case-insensitive tags to strict list --baseforms adds baseform tags ("...") to strict list --wordforms adds wordform tags ("<...>") to strict list --all same as --strip --secondary --regex --icase --baseforms --wordforms XOUT } if (defined $opts{'help'}) { print_help(); exit(0); } if (!defined $opts{'grammar'}) { if (!$ARGV[0]) { print "Missing input grammar argument!\n\n"; print_help(); exit(-1); } $opts{'grammar'} = $ARGV[0]; } if (! -s $opts{'grammar'}) { print "Grammar is either missing or empty!\n"; exit(-1); } if (defined $opts{'strip'}) { $opts{'output'} = 1; } if (defined $opts{'all'}) { $opts{'secondary'} = 1; $opts{'regex'} = 1; $opts{'icase'} = 1; $opts{'baseforms'} = 1; $opts{'wordforms'} = 1; } my $tags = `vislcg3 --show-tags -g '$opts{grammar}' | LC_ALL=C sort`; $tags =~ s@[\r\n]+@\n@g; # Normalize newlines chomp($tags); my @tags = split /\n/, $tags; my $options = ''; if (defined $opts{'secondary'} || defined $opts{'regex'} || defined $opts{'icase'} || defined $opts{'baseforms'} || defined $opts{'wordforms'}) { $options .= 'OPTIONS +='; if (defined $opts{'secondary'}) { $options .= ' strict-secondary'; } if (defined $opts{'regex'}) { $options .= ' strict-regex'; } if (defined $opts{'icase'}) { $options .= ' strict-icase'; } if (defined $opts{'baseforms'}) { $options .= ' strict-baseforms'; } if (defined $opts{'wordforms'}) { $options .= ' strict-wordforms'; } $options .= " ;\n"; } my $strict_tags = 'STRICT-TAGS +='; my @strict = (); my $lf = ''; foreach my $tag (@tags) { if ($tag eq '*') { next; # Marker for erasure, any, etc. } if ($tag eq '>>>') { next; # Start of window marker } if ($tag eq '<<<') { next; # End of window marker } if ($tag =~ m@^\^@) { next; # Fail-fast tags } if ($tag =~ m@^VSTR:@) { next; # Varstrings } if ($tag =~ m@^VAR:@) { next; # Global variables } if ($tag =~ m@^_.+_$@) { next; # Magic placeholders such as _TARGET_ } if ($tag =~ m@dummy string@) { next; # Internal placeholder for tag #0 } if ($tag =~ m@[>"/]v$@) { next; } if (! defined $opts{'secondary'} && $tag =~ m@^<.+>[riv]*$@) { next; } if (! defined $opts{'regex'} && $tag =~ m@^/.+/[riv]*$@) { next; } if (! defined $opts{'icase'} && $tag =~ m@[>"/]i$@) { next; } if (! defined $opts{'baseforms'} && $tag =~ m@^"[^<].*"[riv]*$@) { next; } if (! defined $opts{'wordforms'} && $tag =~ m@^"<.*>"[riv]*$@) { next; } my $nlf = substr($tag, 0, 1); if ($nlf =~ m@^\p{Lu}@) { $nlf = 'A'; } elsif ($nlf =~ m@^\p{Ll}@) { $nlf = 'a'; } elsif ($tag =~ m@^"<.+>"[riv]*$@) { $nlf = 'W'; } elsif ($tag =~ m@^".+"[riv]*$@) { $nlf = 'B'; } if ($lf && $lf ne $nlf) { $strict_tags .= "\n"; } $lf = $nlf; $strict_tags .= " $tag"; push(@strict, $tag); } $strict_tags .= " ;\n"; if (! defined $opts{'output'}) { print "Put these lines at the top of your grammar, and edit the STRICT-TAGS list by removing invalid tags:\n\n"; print $options; print $strict_tags; exit(0); } my $g = ''; { local $/ = undef; open(FH, '<'.$opts{'grammar'}); $g = ; close(FH); } if (defined $opts{'strip'}) { print STDERR "Deleting unnecessary LISTs...\n"; foreach my $tag (@strict) { if ($g =~ m@[\b\n][Ll][Ii][Ss][Tt]\s+\Q$tag\E\s*=\s*\Q$tag\E\s*;@) { print STDERR "Deleting LIST $tag\n"; $g =~ s@[\b\n][Ll][Ii][Ss][Tt]\s+\Q$tag\E\s*=\s*\Q$tag\E\s*;@@; } } } print $options; print $strict_tags; print "\n"; print $g;