#!/usr/bin/perl
# -*- perl -*-
eval 'exec /usr/bin/perl -S $0 ${1+"$@"}'
    if 0;

## NOTE: version of frmg_lexer for lefff #svn > 1025

#use Encode qw/from_to decode encode/;

use strict;
use Carp;
use Memoize;
use UNIVERSAL qw(isa can);

use TAG;
use TAG::LP;

use AppConfig qw/:argcount :expand/;

use IPC::Run qw/start run/;
use IPC::Open2;

##use BerkeleyDB;
##use MLDBM qw{BerkeleyDB::Btree Storable};

## use Storable qw{dclone};

## This script works correctly if encoding is iso-8859-1
## and LC_ALL is derived from fr
## use of utf8 input is not very well addressed, but --utf8 may be used
use POSIX qw(locale_h);
use locale;
setlocale(LC_ALL,"fr");

#use Data::Dumper;
#use IO::Handle;

use File::Temp qw/tempfile/;

use lib '/usr/share/frmg';
use MyRegExp;
## use LexedGram;
use LefffDecoder;

#use Clone;

## Needed for user nobody to be able to load Data::Compare from some place where it has enough rights
BEGIN { chdir '/tmp' };

use Data::Compare;

use Lingua::Features;

my $config = AppConfig->new(
			    "lexed|l=f"       => {DEFAULT => "/usr/bin/lexed"},
			    "dicodir|p=f"     => {DEFAULT => "/usr/lib/lefff-frmg" },
			    "datadir|p=f"     => {DEFAULT => "/usr/share/frmg" },
			    "tmpdir=f"        => {DEFAULT => "/tmp"},
			    "dico|d=s"        => {DEFAULT => 'dico.xlfg' },
			    "macros|m=f"      => {DEFAULT => "dico.xlfg.templates"},
			    "locutions=f"     => {DEFAULT => "dico.xlfg.cmpnd"},
			    "prefixes=f"      => {DEFAULT => "dico.sxspell.pref"},
			    "map=f"           => {DEFAULT => "frgram.map"},
			    "complete=f"      => {DEFAULT => "complete.lex" },
			    "missing=f"       => {DEFAULT => "missing.lex" },
			    "restrictions=f"  => {DEFAULT => "restrictions.txt" },
			    "expand!"         => {DEFAULT => 1},
			    "vcomp_expand!"   => {DEFAULT => 1},
			    "sxpipe!"         => {DEFAULT => 1},
			    "sxpipecmd=f"     => {DEFAULT => 'sxpipe'},
			    "comment!"        => {DEFAULT => 0},
			    "cache|c=f",
			    "utf8!"           => {DEFAULT => 0},
			    "test_decoder!"   => {DEFAULT => 0},
			    "compile!" => {DEFAULT => 0},
			    "dbm!" => {DEFAULT => 0}
			   );

my $conffile;

foreach ("$ENV{HOME}/.frmgrc", "/etc/frmg.conf","/etc/frmg.conf") {
  $conffile=$_, last if -f $_ && -r $_;
}

# read configuration file
if ($conffile) {
  $config->file("$conffile")
    || die "can't open or process configuration file $conffile";
}

$config->args();

my $datadir       = $config->datadir();
my $dicodir       = $config->dicodir();
my $dico          = $config->dico();
my $macrosfile    = $dicodir . '/' . $config->macros();
my $locutionsfile = $dicodir . '/' . $config->locutions();
my $prefixesfile  = $dicodir . '/' . $config->prefixes();
my $mapfile       = $datadir . '/' . $config->map();
my $completefile  = $datadir . '/' . $config->complete();
my $missingfile   = $datadir . '/' . $config->missing();
my $restrfile     = $datadir . '/' . $config->restrictions();
my $expand        = $config->expand();
my $utf8          = $config->utf8();
my $dbm           = $config->dbm();
my $vcompexpand   = $config->vcomp_expand();
my $confcomment   = $config->comment();

my %unknown       = ();

my %cachelexed    = ();
my %countlexed    = ();

my $map = $TAG::LP::map = {};

parse_map_file($map, $mapfile);

TAG::LP::header_reader($datadir . '/' . $map->{'lp'}{'header'}) if (defined $map->{'lp'}{'header'});

memoize 'macro_expand';
##memoize 'entry_parse';

my @lexed = ($config->lexed,'-d',"$dicodir",'-p',"$dico",'consult');

##print "CMD @lexed\n";

my %locutions = ();

if (-r $locutionsfile) {
  open LOC,"<",$locutionsfile
    ||  die "can't open locution file: $!";
  while (<LOC>) {
    chomp;
    my @seq = split(/\s+/,$_);
    while (@seq) {
      my $prefix = join(' ',@seq);
      $prefix =~ s/' /'/og;
      $locutions{$prefix} ||= 1;
      pop @seq;
    }
  }
  close LOC;
}

my %prefixes = ();

if (-r $prefixesfile) {
  open PREF,"<",$prefixesfile
    ||  die "can't open prefixes file: $!";
  while (<PREF>) {
    next if m{^//};
    next if /^\s*$/;
    chomp;
    my ($pref) = split(/\s+/,$_);
    $prefixes{$pref} = 1;
  }
  close PREF;
}

## Hash of Hash of Feature Structures lemme->cat->fs
my $complete = {};

my $missing  = {};
my $redirect = {};

my $restrictions = {};

my $prefpat = join('|',sort {length($b) <=> length($a)} keys %prefixes);
$prefpat = qr/$prefpat/;

my %agglutinates = ( "au"         => ":prep le__det",
		     "aux"        => ":prep les__det",
		     "auquel"     => ":prep lequel",
		     "auxquels"   => ":prep lesquels",
		     "auxquelles" => ":prep lesquelles",
		     "du"         => "de:prep le__det",
		     "duquel"     => "de:prep lequel",
		     "des"        => "de:prep les__det",
		     "desquels"   => "de:prep lesquels",
		     "desquelles" => "de:prep lesquelles",
		     "s"         => "en:prep les__det",
		     "audit"      => ":prep ledit",
		     "auxdits"    => ":prep lesdits",
		     "auxdites"   => ":prep lesdites",
		     "dudit"      => "de:prep ledit",
		     "desdits"    => "de:prep lesdits",
		     "desdites"   => "de:prep lesdites",
		    );
my $agglpat = join('|',keys %agglutinates);
$agglpat = qr/$agglpat/i;

my $lexinfo = qr/\{\{(.*?)\}\}\s*/o;

my %macro = ();

my %allowed_cat = ( 'n'   => 1,
		    'v'   => 1,
		    'cl'  => 1,
		    'd'   => 1,
		    'adj' => 1
		  );

my %cat_transfer = (
		    'auxEtre'  => 'aux',
		    'auxAvoir' => 'aux',

		    # Lefff > 2.01
		    # should rather correspond to a modif of the grammar
		    ## Following are now handled through restrictions.txt
		    ## clar => 'cla',
		    ## cldr => 'cld',

		    ## Following are handled through restrictions.txt
		    ##		    ilimp => 'cln',
		    ##		    caimp => 'pro',

##		    advPref =>  'adv',
##		    adjPref => 'adj'

		    ## lefff #svn > 883

		    cfi => 'ncpred',
		    cf  => 'ncpred', # does not seem exact 

		    ## Lefff #svn > 994
		    parento => 'ponctw',
		    parentf => 'ponctw',

		    ## Lefff #svn > 1025
		    advm => 'adv',
		    advp => 'adv',

		   );

sub normalize_cat {
  my $cat = shift;
  return $cat_transfer{$cat} || $cat;
}


my %lingua = ();

foreach my $type (Lingua::Features::FeatureType->types()) {
  foreach my $idv ($type->values) {
    ##    print "Register type=".$type->id." idv=$idv v=".$type->value_name($idv)."\n";
    $lingua{$type->value_name($idv)} = $idv;
  }
}


my %value_transfer = (
		      'ssubj'    => 'subj',
		      'vsubj'    => 'subj',
		      'intens'   => 'intensive',

		      # Should use either short or long normalized values provided by Lingua::Features
		      # but unfortunately, we mix both notation for the grammar !
		      # => have to establish a manual list
		      'singular' => 'sg',
		      'plural'   => 'pl',
		      'sing'     => 'sg',
		      'true'     => '+',
		      'false'    => '-',

		     );

sub normalize_value {
  my $v = shift;
##  return $lingua{$v} || $value_transfer{$v} || $v;
  return $value_transfer{$v} || $v;
}

my %exclude_feature = (
##		       'lightverb' => 1,
		       'loc'       => 1, # to be activated as soon as possible
		       'form'      => 1,
		       'persposs'  => 1,
		       'advGP'     => 1, # found in Lefff > 2.01
#		       'imp'       => 1, # for impersonels (to be activated as soon as possible)
		       'clloc'     => 1,
		       'clgen'     => 1,
		       'clneg'     => 1,

		       ## Found in Lefff #svn > 883
		       'clla'      => 1, # Role of this feature within cll ?
		       'clle'      => 1, # Role of this feature within cll ?
		       'clles'     => 1, # Role of this feature within cll ?
		       'clloc'     => 1, # Role of this feature within cll ?

		       ## Found in Lefff #svn > 981
		       mod_kind => 1,

		       ## Found in Lefff #svn > 994
		       'RDexplication' => 1,
		       
		       ## Found in Lefff #svn > 1025
		       'clivee'     => 1,
		       'detach_neg' => 1,
		       'detach'     => 1,
		       'advi'       => 1,

		      );

my %exclude_cat_feature = (
			   'adj' => {
				     ## EVDLC: June 9th 2006: 
                                     ## Cause: Lefff evolution
				     ## Adjectives derived from past participle come with useless features
				     'mode' => 1,
				     'diathesis' => 1,
				     ## EVDLC: Nov 07 2007
				     ## Cause: Lefff evolution found in #svn >  909
				     'chunk_type' => 1
				    },
			   'adv' => {
				     ## EVDLC: Feb 28 2007
				     ## Cause: Lefff evolution found in #svn >  909
				     'chunk_type' => 1,

				     ## Found in Lefff #svn > 1078
				     'adv_parlant' => 1
				    },
			   'coo' => {
				     ## Found in Lefff #svn > 994
				     arg2 =>1
				    },
			   'prep' => { 'chunk_type' => 1 },
			   'pres' => { 'chunk_type' => 1 },
			  );

my %feature_transfer = (
			'v-form'        => 'mode',
			'form-aux'      => 'form_aux',
			'aux-req'       => 'aux_req',
			'demonstrative' => 'dem',
			'define'        => 'def',
			'locative'      => 'loc',
			'possessive'    => 'poss',
			'inflnumber'    => 'number',
			'inflperson'    => 'person',
			'qu'            => 'wh',
			## following ones are to reverse from English to French !
			'mood'          => 'mode',

			## Lefff #svn > 883
			'synt_head'     => 'lightverb',

			## Leff #svn > 911
			'weekday' => 'time',
			'year' => 'time',
			

		       );


my %lingua_feature_transfer = (
			       'mood'       => 'mode',
			       'number'     => 'num',
			       'person'     => 'pers',
			       'numberposs' => 'num',
			       'persposs'   => 'pers',
			       'poss'       => 'bool'
);


## List of features whose values are used in long form in the grammar
my %lingua_value_expand = (
			   'tense' => 1,
			   'mode'  => 1,
			   'mood'  => 1
);

sub normalize_feature {
  my $f = shift;
  return $feature_transfer{$f} || $f;
}

my %pcas_transfer = (
		     'loc' => '+' # should be more precise about locative prepositions
		    );

sub normalize_pcas {
  my $p = shift;
  return $pcas_transfer{$p} || $p;
}

## use NumbersGram;
## my $nbparser = new NumbersGram();

#$::RD_TRACE=1;

#$::RD_AUTOACTION = q{ $item[1] };

#$::RD_HINT=1;

sub features {
  my @l = @_;
  my @fs;
  while (@l) {
    my $f = shift @l;
    my $v = shift @l;
    next if $exclude_feature{$f};
    push(@fs,$f => TAG::Comp->new( {name  => $f,
				    rel   => 'eq',
				    value => $v
				   }
				 ));
  }
  return @fs;
}

sub val {
  my $v = shift;
  return TAG::Val->new({text=>$v});
}

sub syntax {
  my $parser = shift;
  my $lemma  = shift;
  my $subcat = shift;
  my $pos    = 0;
  ##  print "LEMMA $lemma\n";
  my $refl = ($lemma =~ s/Se$//o || $lemma =~ /^se_/) ? TAG::Plus->new() : TAG::Minus->new() ;
  my %args = ();
  my @subcat = @$subcat;
##  print "SYNTAX $lemma @subcat\n";
##  print Data::Dumper->Dump([$subcat],['subcat']);
  foreach my $arg ('arg0' .. 'arg2') {
    $args{$arg} = subcat (shift @subcat || { real => [ TAG::Minus->new() ]});
  }
  # print "SUBCAT ",dump(@args),"\n";
  my @args = features( lemma =>  [ val($lemma) ],
		       refl  => [$refl],
		       %args
		     );
  my $ht = TAG::Feature->new({type=>'ht',f=> {@args }});
##  print "SUBCAT ",Data::Dumper->Dump([$ht],['ht']),"\n";
  return $ht;
}

sub subcat {
  my $vals = shift;
  return subcat_with_kind($vals) if (exists $vals->{kind});
  my @pcas = ();
  my @vals = ();
  my %kind = ();
  ## print "SUBCAT ",Data::Dumper->Dump([$vals],['vals']),"\n";
  if ($vcompexpand
      && grep( $_->{text}  && $_->{text} eq 'scomp', @{$vals->{real}}) 
      && !grep( $_->{text} && $_->{text} eq 'vcomp', @{$vals->{real}}) ) {
    ## to overcome some current default of the lexicon
    push(@{$vals->{real}},val('vcomp'),val('-vcomp'),val('de-vcomp'));
  }
  my %pcas = ();
  foreach my $val (@{$vals->{real}}) {
##    next unless (defined $val->{text});
    $val->{text} ||= "";
    next if ($val->{text} && $val->{text} =~ /^arg\d/);
    my $newval = $val;
    if ($val->isa('TAG::Val')) {
      if  ($val->{text} && $val->{text} =~ /^(\S+)-(\S+)/) {
	my $pcas = normalize_pcas($1);
	push(@pcas,val($pcas)) unless (exists $pcas{$pcas});
	$pcas{$pcas} ||= 1;
##	next if (@pcas > 1);	# no need to push another prepobj if already present
	$newval = val("prep$2");
      }
    }
    next if ($newval->{text} && exists $kind{$newval->{text}});
    $kind{$newval->{text}} = 1;
    push(@vals,$newval);
  }
  push(@pcas, TAG::Minus->new());
  return  [ 
	   TAG::Feature->new( { type => 'arg',
				f    => { features( 'kind' => [@vals],
						    'pcas' => [@pcas]
						    )
					  }
			    } ) 
	  ];
}

my %subcat_convert = 
  (
    ## new notation in Lefff December 2006

   Suj    => { 
               sn       => 'subj',
	       sinf     => 'subj',
	       scompl   => 'subj',
	       qcompl   => 'subj',
	       cln      => 'subj',
	   },
   Obj    => { 
	      sn       => 'obj',
	      sinf     => 'vcomp',
	      scompl   => 'scomp',
	      qcompl   => 'whcomp',
	      sa       => 'acomp',	# found on prep
	      cla      => 'obj',
	      sadv     => 'sadv',    # to be added in frgm
	      cln => 'subj',	      # should not occur (but found in Lefff)
	   },
   'Obj' => {
               sn       => 'obj',
	       sinf     => 'vcomp',
	       scompl   => 'scomp',
	       qcompl   => 'whcomp',
	       cld      => 'obj',
	       y        => ['prepobj','cld'],	# to be refined
	   },
   Objde  => {
               sn       => 'obj',
	       sinf     => 'vcomp',
	       scompl   => 'scomp',
	       qcompl   => 'whcomp',
	       clg      => 'obj',
	       en       => ['prepobj','cll'],	# to be refined
	   },
   Att    => { 
               sa       => 'acomp',
	       sn       => 'acomp',
	       sinf     => 'vcomp',	# with obj-ctrl
	       cla      => 'acomp',
	       scompl   => 'scomp',
	       y        => ['prepobj','cll'],
	   },
   Obl    => {
               sn       => 'obj',
	       sinf     => 'vcomp',
	       scompl   => 'scomp',
	       qcompl   => 'whcomp',
	       en       => ['prepobj','cll'],	# to be refined
	   },
   Obl2   => {
               sn       => 'obj',
	       sinf     => 'vcomp',
	       scompl   => 'scomp',
	       qcompl   => 'whcomp',
	       en       => ['prepobj','cll'],	# to be refined
	   },
   Loc    => { # pb with loc-sn
	      sn       => 'obj',
	      y        => ['prepobj','cll'],	# to be refined
	      sinf => 'vcomp',
	      cll => ['prepobj','cll']
	   },
   Dloc   => { # pb with loc-sn
               sn       => 'obj',
	       en       => ['prepobj','cll'],	# to be refined
	   }

  );

sub subcat_with_kind {
  ## new version of subcat to handle new format of lefff (kind:real)
  ## should be revised when frmg will be able to exploit the new informations
  my $vals  = shift;
  my @pcas  = ();
  my @vals  = ();
  my @reals = ();
  my %kind  = ();
  my $kind  = $vals->{kind};
  my %pcas  = ();

##  print "SUBCAT ",Data::Dumper->Dump([$vals],['vals']),"\n";
  foreach my $val (@{$vals->{real}}) {
##    next unless (defined $val->{text});
    $val->{text} ||= "";
    next if ($val->{text} && $val->{text} =~ /^arg\d/);
    my $newval = $val;
    if ($val->isa('TAG::Val')) {
      if  ($val->{text} && $val->{text} =~ /^(\S+)-(\S+)/ && $kind !~ /[Ss]uj/) {
	## we do not yet transform prepsubj (should be done in BIOMG)
	my $pcas = normalize_pcas($1);
	push(@pcas,val($pcas)) unless (exists $pcas{$pcas});
	$pcas{$pcas} ||= 1;
##	next if (@pcas > 1);	# no need to push another prepobj if already present
	$newval = val("prep".$subcat_convert{$kind}{$2});
      } else {
	my $x = $val->{text};
	$x =~ s /^(\S+)-//og;
	my $info = $subcat_convert{$kind}{$x};
	if (ref($info)) {
	  my $real = $info->[1]; # TODO: should complete to exploit realization information
	  $info = $info->[0];
	}
	$newval = val($info);
      }
    }
    next if ($newval->{text} && exists $kind{$newval->{text}});
    $kind{$newval->{text}} = 1;
    push(@vals,$newval);
  }
  push(@pcas, TAG::Minus->new());
  return  [ 
	   TAG::Feature->new( { type => 'arg',
				f    => { features( 'kind' => [@vals],
						    'pcas' => [@pcas]
						    )
					  }
			    } ) 
	  ];
}

sub entry {
  my $parser = shift;
  my $cat    = shift;
  my $fs     = shift;
  my $lemma  = shift || undef;
  my $kind   = shift || undef;
  my $weight = shift || undef;
  my $morpho = shift || [];
##  print "ENTRY cat=$cat lemma=$lemma kind=$kind weight=$weight morpho=$morpho\n";
  $fs->{type} = $cat;
  return TAG::LemmaRef->new({ cat => $cat, feature => $fs });
}

sub TAG::LemmaRef::lexLP {
  my $self    = shift;
  my $fs2     = shift;
  my $name    = quote($self->{'name'});
  my $cat     = quote($self->{'cat'});
  my $lemma   = '_';
##  my $feature = dclone($self->{'feature'});
  my $feature = $self->{'feature'};
  my $ht;
  my $xlemma  = $self->{name};
  my $xcat    = $self->{cat};

  if ($feature) {
    
##      print Data::Dumper->Dump([$feature],['feature']);

    if (defined $fs2) {
      return unless (fs_unify($feature,$fs2));
    }

    if (defined $exclude_cat_feature{$xcat}) {
      foreach my $key (keys %{$exclude_cat_feature{$xcat}}) {
	delete $feature->{f}{$key};
      }
    }


    my @ctrkeys = grep(/^(\S+-)?(ssubj|subj|obj|acomp|vcomp|scomp|suj:sn|Suj|Obj|Att|Obl|Loc|DLoc)/,keys %{$feature->{f}});

    foreach my $key (@ctrkeys) {
    }

      delete $feature->{f}{$_} foreach @ctrkeys;

      if ($xcat eq 'v' && !defined $feature->{f}{pred}) {
	my $ht = TAG::Feature->new({type => 'ht',
				    f => {}
				   });
	$ht->{f}{arg0} = 
	  TAG::Comp->new(
			 { name => 'arg0',
			   rel => 'eq',
			   value => [ 
				     TAG::Minus->new()
				    ]
			 }
			);
	$ht->{f}{arg1} = 
	  TAG::Comp->new(
			 { name => 'arg1',
			   rel => 'eq',
			   value => [ 
				     TAG::Minus->new()
				    ]
			 }
			);
	$ht->{f}{arg2} = 
	  TAG::Comp->new(
			 { name => 'arg2',
			   rel => 'eq',
			   value => [ 
				     TAG::Minus->new()
				    ]
			 }
			);
	$feature->{f}{pred} = TAG::Comp->new( {name => 'pred',
					       rel => 'eq',
					       value => [$ht]} );
      }

    if(defined $feature->{f}{pred}) {
      $ht     = $feature->{f}{pred}{value}->[0];
##      print Data::Dumper->Dump([$ht->{f}{lemma}{value}->[0]],['ht']);
      if (defined $ht->{f}{lemma}) {
	$xlemma = $ht->{f}{lemma}{value}->[0]{'text'};
	$xlemma =~ s/_+(\d+)$//og; # normalizing lemma
	$lemma  = quote($xlemma);
	delete $ht->{f}{lemma};
	##      print "LEMMA $lemma\n";
      }
      
      ## dealing with true pronominal
      $ht->{f}{refl} = $feature->{f}{refl} if (exists $feature->{f}{refl});

      ## dealing with impersonels (active with no deep subjects)
      $ht->{f}{imp} = $feature->{f}{imp} if (exists $feature->{f}{imp});
      if ((exists $ht->{f}{imp})  
##	  && $ht->{f}{imp}{value}[0]->isa('TAG::Plus')
	  && $feature->{f}{imp}{value}[0]->isa('TAG::Plus')
	  && ( !(exists $feature->{f}{diathesis})  
	       || $feature->{f}{diathesis}{value}[0]->{text} ne 'passive'
	     )
	  &&   ( $ht->{f}{arg0}{value}[0]{f}{kind}{value}[0]->isa('TAG::Minus')
		 || ( $ht->{f}{arg0}{value}[0]{f}{kind}{value}[0]->isa('TAG::Val')
		      && $ht->{f}{arg0}{value}[0]{f}{kind}{value}[0]{text} ne 'subj'
		    ))
	 )  {
	 ## print "%% **** HANDLE IMPERSONEL ***\n";
	## if impersonel, we may have to shift arguments 
	for(my $i=1; $i > -1; $i--) {
	  if (exists  $ht->{f}{"arg$i"}) {
	    my $j = $i+1;
	    ## print "Shift arg$i -> arg$j\n";
	    my $x = $ht->{f}{"arg$i"};
	    $x->{name} = "arg$j";
	    $ht->{f}{"arg$j"} = $x;
	    delete  $ht->{f}{"arg$i"};
	  }
	}
	$ht->{f}{arg0} = TAG::Comp->new( 
					{ name => 'arg0',
					  rel => 'eq',
					  value => [
						    TAG::Feature->new(
								      { type => 'arg',
									f => { kind => TAG::Comp->new({ name => 'kind',rel=>'eq',value=>[TAG::Val->new({text=>'nosubj'})]})
									     }
								      })
						   ]
					}
				       );
      }


      if ($xcat eq "v" && !exists $feature->{f}{diathesis}) {
	$feature->{f}{diathesis} = TAG::Comp->new(
						  { name => 'diathesis',
						    rel => 'eq',
						    value => [ 
							      TAG::Val->new({text => 'active'})
							     ]
						  }
						 );
      }

      $ht->{f}{diathesis} = $feature->{f}{diathesis} if (exists $feature->{f}{diathesis});

      $ht = $ht->LP('ht');
      delete $feature->{f}{pred};
    }
    
    if (defined $feature->{f}{fct} 
	## need to check that expansion has not yet been done
	## because of interactions with memoization
	&& $feature->{f}{fct}{value}[0]->isa('TAG::Val') 
       ) {
      my $subcat = $feature->{f}{fct}{value};
#      print "FCT ",Data::Dumper->Dump([$subcat],['subcat']),"\n";
      $feature->{f}{fct} = TAG::Comp->new( {name => 'fct',
					    rel => 'eq',
					    value => subcat({real => $subcat})
					    } );
#      print "FCT2 ",Data::Dumper->Dump([$subcat],['subcat']),"\n";
    }

    ## do not handle support verbs (lightverbs) at this time
    ##    return  if (exists $feature->{f}{lightverb});


    if (defined $complete->{$xlemma}{$xcat}) {
      ##      print "%% Found completion entry $lemma $cat\n";
      my $cfs = $complete->{$lemma}{$xcat};
      ## Perform very limited unification by adding features not already present
      foreach my $key (keys %{$cfs->{f}}) {
	$feature->{f}{$key} = $cfs->{f}{$key} unless (exists $feature->{f}{$key});
      }
    }

    ## Special keys (syntactic ones that should not appear as morpho-syntactic features)
    delete $feature->{f}{cat};
    delete $feature->{f}{refl};	# reflexives
    delete $feature->{f}{imp};	# impersonels

    $feature = $feature->LP($cat);
  } else {
    $feature = '_';
  }
  $ht = '_' unless defined $ht;

  return { lemma  => $lemma, 
	   cat    => $cat, 
	   top    => $feature,
	   ht     => $ht,
	   xlemma => $xlemma
	 }
}

sub entry_print {
    my ($self,$lex,$left,$right,$nlex) = @_;
##    print "\%\%Entry print $lex $left $right\n";
    $lex  = quote($lex);
    $nlex =~ s/qu'$/que/o;	# Should be properly handled by the lexicon !
    $nlex = quote($nlex);
    my $lemma = ($self->{lemma} eq '_') ? $nlex : $self->{lemma};
##    $lemma =~ s/_+(\d+)$/_$1/og; # if we wish to keep a trace of which lemma

    print <<EOF
'C'($left, 
    lemma{ lex     => $nlex,
           truelex => $lex,
	   lemma   => $lemma,
	   cat     => $self->{cat},
	   top     => $self->{top},
	   anchor  => tag_anchor{ name => $self->{ht}, coanchors => [], equations => [] }
	 },
    $right
   ).

EOF
}

my $parser = new LefffDecoder();

read_macrosfile($macrosfile);

## Reading extra info to complete some entries
if (-r $completefile) {
  open COMP,"<",$completefile 
    ||  die "can't open complete file: $!";
  while (<COMP>) {
    next if m{^//};
    next if /^\s*$/;
    next if /^#/;
    chomp;
    my ($lemma,$cat,$fs) = split(/\t+/,$_,3);
##    print "Handling complete '$lemma' '$cat' '$fs'\n";
##    $parser->YYData->{INPUT}="$fs;\n";
    $fs = $parser->Run("$fs;");
    $complete->{$lemma}{$cat} = $fs;
##    print "Register complete info for $lemma $cat\n";
  }
  close COMP;
}

## Reading extra info for missing entries
if (-r $missingfile) {
  open COMP,"<",$missingfile 
    ||  die "can't open missing entry file: $!";
  while (<COMP>) {
    next if m{^//};
    next if /^\s*$/;
    next if /^#/;
    chomp;
    if (s/^\@adj\s+//) {
      my ($ms,$mp,$fs,$fp) = split(/\s+/,$_);
      my $lemma = "$ms<Suj:(sn),Objde:(de-sn|de-sinf|de-scompl),Obj:(-sn|-sinf|-scompl)>";
      my $xlemma = $ms . "____20";
      $mp="$ms$1" if ($mp =~ /^\+(\w*)/);
      $fs="$ms$1" if ($fs =~ /^\+(\w*)/);
      $fp="$ms$1" if ($fp =~ /^\+(\w*)/);

      ## print "REGISTER MISSING ADJ $lemma $ms $mp $fs $fp\n";

      push(@{$missing->{$ms}},"adj\t[pred='$lemma',cat=adj,\@ms]\t$xlemma\tDefault");
      push(@{$missing->{$mp}},"adj\t[pred='$lemma',cat=adj,\@mp]\t$xlemma\tDefault");
      push(@{$missing->{$fs}},"adj\t[pred='$lemma',cat=adj,\@fs]\t$xlemma\tDefault");
      push(@{$missing->{$fp}},"adj\t[pred='$lemma',cat=adj,\@fp]\t$xlemma\tDefault");

      push(@{$missing->{ucfirst($ms)}},"adj\t[pred='$lemma',cat=adj,\@ms]\tx$lemma\tDefault");
      push(@{$missing->{ucfirst($mp)}},"adj\t[pred='$lemma',cat=adj,\@mp]\t$xlemma\tDefault");
      push(@{$missing->{ucfirst($fs)}},"adj\t[pred='$lemma',cat=adj,\@fs]\t$xlemma\tDefault");
      push(@{$missing->{ucfirst($fp)}},"adj\t[pred='$lemma',cat=adj,\@fp]\t$xlemma\tDefault");

    } elsif (s/^\@nc\s+//) {
      my ($s,$p,$gender) = split(/\s+/,$_);
      my $lemma = "$s<Objde:(de-sn|de-sinf),Obj:(-sinf)>";

      $p="$s$1" if ($p =~ /^\+(\w*)/);

      push(@{$missing->{$s}},"nc\t[pred='$lemma',cat=nc,".$gender."s]\t$lemma\tDefault");
      push(@{$missing->{$p}},"nc\t[pred='$lemma',cat=nc,".$gender."p]\t$lemma\tDefault");

      push(@{$missing->{ucfirst($s)}},"nc\t[pred='$lemma',cat=nc,".$gender."s]\t$lemma\tDefault");
      push(@{$missing->{ucfirst($p)}},"nc\t[pred='$lemma',cat=nc,".$gender."p]\t$lemma\tDefault");

    } elsif (s/^\@redirect\s+//) {
      my ($old,$new,@others) = split(/\s+/,$_);
      my $cat = 1;
      $cat = $1 if ($new =~ s/__(\S+)$//o);
      foreach my $key ($new,@others) {
	$key = "$new$1" if ($key =~ /^\+(\S*)/);
	$redirect->{$old}{$key} = $cat;
	$redirect->{ucfirst($old)}{$key} = $cat;
      }
    } else {
      my ($form,$info) = split(/\t+/,$_,2);
      ##    print "Handling complete '$lemma' '$cat' '$fs'\n";
      ##    $fs = $parser->fs($fs);
      push(@{$missing->{$form}},$info);
      ##    print "Register complete info for $lemma $cat\n";
    }
  }
  close COMP;
}

## Reading extra info for restricting lefff entries
if (-r $restrfile) {
  open COMP,"<",$restrfile
    ||  die "can't open restr entry file: $!";
  while (<COMP>) {
    next if m{^//};
    next if /^\s*$/;
    next if /^#/;
    chomp;
    my ($form,@cats) = split(/\s+/,$_);
    ##    print "Handling complete '$lemma' '$cat' '$fs'\n";
    ##    $fs = $parser->fs($fs);
    ## print "Register restriction $form cats=@cats\n";
    $restrictions->{$form} = { map {$_ => 1} @cats };
    ##    print "Register complete info for $lemma $cat\n";
  }
  close COMP;
}



my @fifo = ();
my @pending = ();
my $nbsentence = 0;

my $lexin = '';
my $lexout = '';
my $h;
my %h;

if (0 && $config->utf8) {
  $h = start ['recode','u8..l1'],\$lexin,
    '|',\@lexed,
      '|',['recode','l1..u8'],\$lexout ;
# } elsif ($dbm){
#   my $db = tie %h, 'MLDBM', 
#     -Filename => "lefff.db",
#       or die "Cannot open database: $!";
} else {
  $h = start \@lexed,\$lexin,\$lexout;
}

my $dagmode    = 0;
my $dagmax     = 0;
my $dagfirst;
my %dag        = ();

my %mafstruct  = ();
my $mafcurrent = 0;


$|=1;


if ($config->test_decoder) {
##  use Data::Dumper;
  while(<>) {
    chomp;
##    print "Entry parse: $_\n";
    my $res = fs_restrict(entry_parse($_));
##    print Data::Dumper->Dump([$res],['entry']);
  }
# } elsif ($config->compile) {
# ##  use BerkeleyDB;
#   my %h;
#   my $db = tie %h, 'MLDBM',
#     -Filename => "lefff.db",
#       -Flags => DB_CREATE
# 	or die "Cannot open database: $!";
#   my $n=0;
#   my @res = ();
#   my $key;
#   while(<>) {
#     chomp;
#     ## Do some corrections (tmp)
#     s/\@+/\@/og;
#     s/`red=/pred=/og;
#     ##    print "Entry parse: $_\n";
#     my ($id,$lex) = ($_ =~ /^(.+?)\t+(.*)/);
#     if ($key && $id ne $key) {
#       $h{$key} = [@res];
#       @res=();
#     }
#     $key=$id;
#     my $res = fs_restrict(entry_parse($lex));
#     ##    print Data::Dumper->Dump([$res],['entry']);
#     push( @res, $res );
#     $n++;
#     unless ($n % 10000) {
# ##      exit;
#       print STDERR "$n\n";
#     }
#   }
#   $h{$key} = [@res];
#   undef $db;
#   untie %h;
} elsif ($config->sxpipe) {
  my $sxpipecmd = $config->sxpipecmd();
  my $cmd = "$sxpipecmd | dag2udag";
##  $cmd = "recode u8..l1 | $cmd | recode l1..u8" if ($config->utf8);
  while (<>) {
    if (/^##\s*DAG\s+BEGIN\s*(.*)/oi) {
      line_handler(\$_);
      while (<>) {
	line_handler(\$_);
	last if (/^##\s*DAG\s+END/oi);
      }
    } elsif (/^##\s*MAF\s+BEGIN/oi) {
      my $maf = "";
      while (<>) {
	last if /^##\s*MAF\s+END/oi;
	$maf .= $_;
      }
      $dagmax = 0;
      %mafstruct = ();
      $mafcurrent = 0;
      sentence_start($1);
      ## Handle $maf
      maf_handle($maf);
      sentence_end($dagmax);
      $dagmax = 0;
      %mafstruct = ();
    } elsif (/^<\?xml/oi) {
      my $maf = $_;
      while (<>) {
	$maf .= $_;
	last if m{^</maf>}oi;
      }
      $dagmax = 0;
      %mafstruct = ();
      $mafcurrent = 0;
      sentence_start($1);
      ## Handle $maf
      maf_handle($maf);
      sentence_end($dagmax);
      $dagmax = 0;
      %mafstruct = ();
    } else {
      my $lpid = open2(*OUT,*IN,$cmd);
      #from_to($_,"utf8","iso-8859-1") if $utf8;
      ## TMP: time to fix 'est-ce' in text2dag (Feb 28 2007)
      $_ =~ s/\b([Ee]st)(-ce)\b/$1 ce/og;
      print IN $_;
      close(IN);
      while (<OUT>) {
	#from_to($_,"iso-8859-1","utf8") if $utf8;
	##	print "%% Got $_";
	line_handler(\$_);
      }
      close OUT;
      waitpid $lpid, 0;
    }
  }
} else {
  while (<>) {
    line_handler(\$_);
  }
}

sub line_handler {
  my $lref = shift;
  if ($$lref =~ /##\s*DAG\s+BEGIN\s*(.*)/) {
    $dagmode ||= 1;
    $dagmax   = 0;
    $dagfirst = undef;
    %dag      = ();
    sentence_start($1);
    return;
  };
  if ($dagmode && $$lref =~ /##\s*DAG\s+END\b/) {
    sentence_end($dagmax);
    $dagmax  = 0;
    $dagmode = 0;
    %dag     = ();
    return;
  };
  if ($dagmode) {
    dag_decode($$lref);
  } else {
    segmenter($$lref);
  }
}

$h->finish unless ($dbm);

sub sentence_start {
  my $sentence = shift || '';
  @fifo = ();
  $nbsentence++;
  if ($expand) {
  print <<EOF;
%% Token database generated by $0
%% Sentence $nbsentence $sentence

EOF
} else {
  print "%% $sentence\n";
}
} 

sub sentence_end {
  my $n = shift;
  foreach my $left (sort {$a <=> $b} keys %dag) {
    foreach my $right (sort {$a <=> $b} keys %{$dag{$left}}) {
      foreach my $data (@{$dag{$left}{$right}}) {
	handle_word($data) if (defined $data);
      }
    }
  }
  while (@fifo) {
    my $word = shift @fifo;
    handle_word($word) unless ($word->{sent});
  }
  print <<EOF if ($expand);

'N'($n).

EOF

  print <<EOF;
%% SENTENCE DIV %%
%%%EOF

EOF

}

sub handle_date {
  my $date    = shift;
  my $lemma   = quote(shift);
  my $left    = shift;
  my $right   = shift;
  my $truelex = quote(shift || $date);
  $date       = quote($date);
  print <<EOF if ($expand);
'C'($left,
    lemma{ lex     => $date,
           truelex => $truelex,
           cat     => nc,
           top     => nc{gender => masc, number => sg, time => (+)},
           lemma   => $lemma
         },
    $right
   ).

EOF
}

sub handle_number {
  my $xnumber = shift;
  my $number  = quote($xnumber);
  my $lemma   = quote(shift);
  my $left    = shift;
  my $right   = shift;
  my $gender  = shift || '_';
  my $truelex = shift || $xnumber ;
  send_word('_NUMBER',$truelex,"pro|nc|adj",$left,$right,'keep',undef);
  $truelex = quote($truelex);
  print <<EOF if ($expand);
'C'($left,
    lemma{ lex     => $number,
           truelex => $truelex,
           cat     => number,
           top     => number{ gender => $gender },
           lemma   => $lemma
         },
    $right
   ).

EOF
}

sub handle_ordinal {
  my $lex     = shift;
  my $lemma   = quote(shift);
  my $left    = shift;
  my $right   = shift;
  my $number  = shift || '_';
  my $truelex = quote(shift || $lex);
  $lex = quote($lex);
  print <<EOF if ($expand);
'C'($left,
    lemma{ lex   => $lex,
           lex   => $truelex,
           cat   => adj,
           top   => adj{ number => $number },
           lemma => $lemma
         },
    $right
   ).

EOF
}

sub handle_word {
  my $data         = shift;
  my $lex          = $data->{lex};
  my $truelex      = $data->{truelex};
  my $ultratruelex = $truelex;
  $ultratruelex    =~ s/^(E\d+(?:\.\d+)?F\d+\|)//o;
##  print "Handling word lex='$lex' truelex=$truelex ultralex='$ultratruelex'\n";
  my $lutlex         = lc($ultratruelex);
  my $info;
  my $fs = $data->{fs};
  my $llex=lc($lex);
  my @lutlex = split(/['-]/,$lutlex);

  if ($lex ne $ultratruelex 
      && $lex ne $lutlex
      && !(defined $data->{prefix} || defined $data->{suffix})
      && defined $missing->{$ultratruelex}) {
    $lex = $ultratruelex;
    if (!exists $cachelexed{$lex}{$ultratruelex}{$fs}) {
      my @info=@{$missing->{$lex}};
      $info = $cachelexed{$lex}{$ultratruelex}{$fs} = 
	fs_restrict(entry_parse(@info),$fs);
    }
  } elsif ($lex ne $ultratruelex
	   && $lex ne $lutlex
	   && $lex !~ /-$/
	   && defined $data->{prefix}
	   && $lutlex =~ /^(\S+)-/
	   && (!grep {$llex eq $_} @lutlex)
	   && defined $missing->{$1}) {
    $lex = $1;
    if (!exists $cachelexed{$lex}{$ultratruelex}{$fs}) {
      my @info=@{$missing->{$lex}};
      $info = $cachelexed{$lex}{$ultratruelex}{$fs} = 
	fs_restrict(entry_parse(@info),$fs);
    }
  } elsif ($lex ne $ultratruelex
	   && $lex ne $lutlex
	   && $lex !~ /^-/
	   && defined $data->{suffix}
	   && $lutlex =~ /-(\S+)$/
	   && (!grep {$llex eq $_} @lutlex)
	   && defined $missing->{$1}) {
    $lex = $1;
    if (!exists $cachelexed{$lex}{$ultratruelex}{$fs}) {
      my @info=@{$missing->{$lex}};
      $info = $cachelexed{$lex}{$ultratruelex}{$fs} = 
	fs_restrict(entry_parse(@info),$fs);
    }
  }
  if (!$dbm and !exists $cachelexed{$lex}{$ultratruelex}{$fs}) {
    my $tmp = "$lex\n";
    #from_to($tmp,"utf8","iso-8859-1") if $utf8;
    $lexin = $tmp;
    ## print "SEND INFO TO LEXED $lexin\n";
    $h->pump until ($lexout =~ /\n/);
    ## print "GOT INFO FROM LEXED $lexout\n";
    my $tmpinfo = $lexout;
    #from_to($info,"iso-8859-1","utf8") if $utf8;
    ## print "Translated into $info\n";
    chomp $tmpinfo;
    $lexout = '';
    my @info = ();
    push(@info,$tmpinfo) if ($tmpinfo);
    push(@info,@{$missing->{$lex}}) if (defined $missing->{$lex});
    $info =
      fs_restrict(entry_parse(@info),$fs);
    $cachelexed{$lex}{$ultratruelex}{$fs} = $info if ($countlexed{$lex}++ > 5);
  }
#   if ($dbm and !exists $cachelexed{$lex}{$ultratruelex}{$fs}) {
# #    print "Try lex='$lex'\n";
#     my @res = @{$h{$lex}};
# #    print Data::Dumper->Dump([\@res],['res']);
#     my @info = ();
#     push(@info,@{$missing->{$lex}}) if (defined $missing->{$lex});
#     my $res2 = @info ? entry_parse(@info) : [];
#     $info = $cachelexed{$lex}{$ultratruelex}{$fs} = 
#       [map(@$_,@res),@{fs_restrict($res2,$fs)}];
#   }
  $info ||= $cachelexed{$lex}{$ultratruelex}{$fs};
  my $cat = $data->{cat};
  my %cat = ();
  if (defined $cat) {
    $cat{$_} ||= 1 foreach (split(/\|/,$cat));
  }
  my $left   = $data->{left};
  my $right  = $data->{right};
  my $status = $data->{status};
  my $altbox = $data->{altbox};
##  print "%% lexed handling '$lex' '$truelex' cat='$cat' status=$status: $info\n";
  if ($info =~ /<unknown>/ ) {
    return if ($status eq 'discard');
    return if (defined $altbox  && $altbox->{found});
    return unless ($expand);
    unless (defined $cat) {
      if ($lex eq '_Uw' || ($lex =~ /^[[:upper:]]/ && $lex !~ /\b-[[:lower:]]/)) {
	$cat = 'np';
      } else {
	$cat = 'cat[v,nc,adv,adj]';
	$unknown{$lex} = 1;
      }
    }
    $lex = quote($lex);
    $truelex = quote($truelex);
    print <<EOF;
'C'($left, 
    lemma{ lex => $lex, truelex => $truelex, cat => $cat },
    $right
   ).
/*
'C'($left, 
    lemma{ lex => $lex, truelex => $truelex, cat => v },
    $right
   ).
'C'($left, 
    lemma{ lex => $lex, truelex => $truelex, cat => nc },
    $right
   ).
'C'($left, 
    lemma{ lex => $lex, truelex => $truelex, cat => adv },
    $right
   ).
'C'($left, 
    lemma{ lex => $lex, truelex => $truelex, cat => adj },
    $right
   ).
*/
EOF
    return;
  }
  return if (defined $altbox && $altbox->{found} == 1 && $data->{status} eq 'discard_but');
  if ($expand) {
##    my $entries = fs_restrict($info,$data->{fs});
    my $entries = $info;
    foreach my $entry (@$entries) {

##           print "TEST  entry elemma=$entry->{xlemma} ecat=$entry->{cat} dlex=$lex dlemma=$data->{lemma}\n";
##      print "  with data lemma=$data->{entry}\n";

      ##      my $lemma = ($entry->{lemma} eq '_' ? $lex : unquote($entry->{lemma}));
      my $lemma = $entry->{xlemma} || $lex;

      next if (defined $cat && !exists $cat{$entry->{cat}});
      next if (defined $restrictions->{$lex} && exists $restrictions->{$lex}{$entry->{cat}});
      next if (defined $restrictions->{"\@$lemma"} && exists $restrictions->{"\@$lemma"}{$entry->{cat}});
      next if (exists $data->{entry} && $data->{entry} && $data->{entry} ne $lemma);
      $altbox->{found} ||= 1 if (defined $altbox);
      entry_print($entry,$truelex,$left,$right,$lex);
    }
  } else {
    print "$data->{truelex}\n";
  }
}

sub entry_parse {
  my @entries = @_;
##  print "Entry parse: $entries\n";
##  $parser->YYData->{INPUT}="$entries\n";
  my $entries = join('|',@entries);
  return $entries if ($entries =~ /<unknown>/);
  my @fields = split(/\t/,$entries);
  my $results = [];
  while (@fields) {
    my $weight = shift @fields;
    $weight ||= 1;
    my $cat;
    if ($weight =~ /^\d+(\.\d+)?$/) {
      $cat = shift @fields;
    } else {
      $cat = $weight;
      $weight = 1;
    }
    $cat = normalize_cat($cat);
    my $fs = shift @fields;
    my $lexid = shift @fields;
    my $kind = shift @fields;
    my $morpho;
    my $info;
    if ($kind && $kind =~ /(\S*)\|(\S*)/) {
      $kind = $1;
      unshift(@fields,$2);
    } else {
      $morpho = shift @fields;
      if ($morpho &&  $morpho =~ /(\S*)\|(\S*)/) {
	$morpho = $1;
	unshift(@fields,$2);
      } else {
	$info = shift @fields;
	if ($info &&  $info =~ /(\S*)\|(\S*)/) {
	  $info = $1;
	  unshift(@fields,$2);
	} 
      }
    }
    0 && print <<EOF;
Handling
   w='$weight'
   cat='$cat'
   fs='$fs'
   lexid='$lexid' 
   kind='$kind'
   morpho='$morpho'
   info='$info'
EOF
    next unless ($cat and $fs);
    my $res  = $parser->Run("$fs;");
    0 && print Data::Dumper->Dump([$res],['entry']);
   push(@$results,entry($parser,$cat,$res,$lexid,$kind,$weight,$morpho));
  }

##  print Data::Dumper->Dump([$res],['entry']);
  return $results;
}

sub fs_restrict {
  my $entries = shift;
  my $fs = shift;
  return $entries if ($entries =~ /<unknown>/);
  return [ map($_->lexLP($fs),@$entries) ];
}

END {
  if (%unknown) {
    my $file = $config->tmpdir()."/biomg.unknown";
    if (-r $file) {
      if (open(UNKNOWN,"<$file")) {
	foreach my $word (<UNKNOWN>) {
	  chomp $word;
	  delete $unknown{$word};
	}
	close(UNKNOWN);
      } else {
	warn "can't open $file: $!";
      }
    }
    if (open(UNKNOWN,">>$file")) {
      print UNKNOWN "$_\n" foreach (sort keys %unknown);
      close(UNKNOWN);
    } else {
      warn "can't open $file: $!";
    }
  }
}

sub segmenter {
  my $line = shift;
  my $i = 0;

  sentence_start($line);

  $line =~ s/([.]+)(?![:\w])/ $1/og;
  $line =~ s/([!?]+)(?!:)/ $1/og;
  $line =~ s/([\(\]+)/$1 /og;
  $line =~ s/([,;?\\)])(?!:)/ $1/og;
  $line =~ s/"/ " /og;
  $line =~ s/:\s+/ : /og;
##  $line =~ s/\bl'on\b/on/ogi;
  $line =~ s/((?:(?:^|\s)\w|qu)\')/$1 /ogi;
  $line =~ s/\b-t-(il|elle|on|y|ils|elles)\b/ -t-$1/ogi;
  $line =~  s/\B([td])-(il|elle|on|y|ils|elles)\b/$1 -$2/ogi;
  $line =~ s/\B-je\b/e -je/ogi;
  $line =~ s/(?<!-t)-(je|tu|nous|vous|ce|ci|mme|le|la|lui|moi|toi|leur|on|en)
	     (?=$|\s+|-(?:le|la|lui|moi|toi|leur|on|en))
	     / -$1/ogix;
  $line =~ s/\b-(ci|l)\b/ -$1/ogi;

 ## $line =~ s/\b($agglpat)\b/$agglutinates{lc($1)}/oig;
 ##  $line =~ s/\b($agglpat)-/$agglutinates{lc($1)}-/oig;

  ##      print STDERR $line;
  ##      $line =~ //g;
  ##  print "%% LINE '$line'\n";
  
  my $origlex = "";

 LOOP: {
    
##    print "%% HERE $i\n";

    redo LOOP if ($line =~ /\G\s+/gc);
    
    last LOOP if ($line =~ /\G\#/gc || $line =~ /\G$/gc);
    my $left = $i;
    my $right = ++$i;

    if ($line =~ /\G$lexinfo/gc) {
      $origlex = clean_comment($1);
##      $origlex =~ s/^(\S+?):/$1\t/o;
##      print "ORIGLEX $origlex\n";
      --$i;
      redo LOOP;
    }


    ## Titles
    if ($line =~ /\G($RE->{title})/gc) {
      my ($title,@comment) = extract_comment($1);
      my $truelex = $origlex ? join(' ',$origlex,@comment) : $title;
      $right += scalar(my @tmp = split(/\s+/,$title)) - 1;
      $i = $right;
      $title =~ s/\s+\./\./og;
      send_word($title,$truelex,'nc',$left,$right,'keep');
      redo LOOP;
    }

    ## Abbreviations
    if ($line =~ /\G($RE->{abbrev})/gc) {
      ##      print "FOUND ABBREV $1\n";
      my ($abbrev,@comment) = extract_comment($1);
      my $truelex = $origlex ? join(' ',$origlex,@comment) : $abbrev;
      $i = $right += scalar(my @tmp = split(/\s+/,$abbrev)) - 1;
      $abbrev =~ s/\s+\././og;
      send_word($abbrev,$truelex,'np',$left,$right,'keep');
      redo LOOP;
    }

    ## Tel
    if ($line =~ /\G($RE->{tel})/gc) {
      ##      print "FOUND TEL $1\n";
      my ($tel,@comment) = extract_comment($1);
      my $truelex = $origlex ? join(' ',$origlex,@comment) : $tel;
      $i = $right += scalar(my @tmp =  split(/\s+/,$tel)) - 1;
      $tel =~ s/\s+\./\./og;
      send_word('_TEL',$truelex,undef,$left,$right,'keep');
      redo LOOP;
    }

    ## Email
    if ($line =~ /\G($RE->{email})\b/gc) {
      ##      print "FOUND EMAIL $1\n";
      my ($email,@comment) = extract_comment($1);
      my $truelex = $origlex ? join(' ',$origlex,@comment) : $email;
      $i = $right += scalar(my @tmp = split(/\s+/,$email)) - 1;
      $email =~ s/\s+\./\./og;
      send_word('_EMAIL',$truelex,undef,$left,$right,'keep');
      redo LOOP;
    }



    ## Roman Number
    if ($line =~ /\G([IVXLCDM]{2,}(:?e|eme(:?s)))(\s+|$)/gc) {
      my $ordinal = $1;
      my $truelex = $origlex || $ordinal;
      my $number = ($ordinal =~ /s$/) ? 'pl' : 'sg';
      handle_ordinal($ordinal,'ROMAN_NUMBER',$left,$right,$number,$truelex);
      redo LOOP;
    }
   
    ## Dates
    if ($line =~ /\G($RE->{date})\b/gc) {
      my ($date,@comment) = extract_comment($1);
      my $truelex = $origlex ? join(' ',$origlex,@comment) :  $date;
      $i = $right += scalar(my @tmp = split(/\s+/,$date)) - 1;
      handle_date($1,'DATE',$left,$right,$truelex);
      redo LOOP;
    }
    
    ## Ordinals
    if ($line =~ /\G($RE->{ordinal})\b/gc) {
      my ($ordinal,@comment) = extract_comment($1);
      my $truelex = $origlex ? join(' ',$origlex,@comment) : $ordinal;
      $i = $right += scalar(my @tmp = split(/\s+/,$ordinal)) - 1;
      my $number = ($ordinal =~ /s$/) ? 'pl' : 'sg';
      handle_ordinal($ordinal,'ORDINAL',$left,$right,$number,$truelex);
      redo LOOP;
    }

    ## Other numbers
    if ($line =~ /\G($RE->{number})\b/gc) {
      my ($number,@comment) = extract_comment($1);
      my $truelex = $origlex ? join(' ',$origlex,@comment) : $number;
      $i = $right += scalar(my @tmp = split(/\s+/,$number)) - 1;
      my $gender = ($number =~ /une$/) ? 'fem' : '_';
      handle_number($number,'_NUMBER',$left,$right,$gender,$truelex);
      redo LOOP;
    }

    if ($line =~ /\G:\s+/gc) {
      my $truelex = $origlex || ':';
      send_word(':',$truelex,undef,$left,$right,'keep',undef);
      redo LOOP;
    }

    ## agglutinates
    if ($line =~ /\G((?:\S+_)?)($agglpat)\b/gc) {
        my $pref = $1;
        my $word = $2;
	my $comment = $origlex ? "$origlex" : "";
        my @words = split(/\s+/,$agglutinates{lc($word)});
	$words[0] = "$pref$words[0]";
        $i = $right += @words - 1;
	my $altbox ||= { found => 0 };
	cut_word($comment,"$pref$word",$left,$right,$altbox,'discard');
	for (my $j = $left; @words ; $j++) {
	  my $word = shift @words;
	  cut_word($comment,$word,$j,$j+1,$altbox);
	  $comment =~ s/(\S+?)\|(.*)/$1\|*/o;
	}
        redo LOOP;
    }

    ## standard tokens
    if ($line =~ /\G(\S+)/gc){
      cut_word($origlex,$1,$left,$right);
      redo LOOP;
    }
  }

  sentence_end($i);

}

sub extract_comment {
  my $s = shift;
  my @s = split(/$lexinfo/,$s);
  my @w = ();
  my @comment = ();
  while (@s) {
    push(@w,shift @s);
    push(@comment,clean_comment(shift @s)) if (@s);
  }
  return (join(' ',@w),@comment);
}

sub clean_comment {
  my $s = shift;
  $s =~ s/\s+//og;
  return $s;
}

sub cut_word {
  my ($truelex,$xlex,$left,$right,$altbox,$status) = @_;
  my ($lex,$cat,$ht,$top) = split(':',$xlex);
##  print "LEX $lex $truelex\n";
  $truelex ||= $lex;
##  $lex =~ s/(?<!_)_(?!_)/ /o;
  $lex =~  s/([^_-])_([^_-])/$1 $2/og;
  $lex =~ s/'\s+/'/o;
  unless ($status) {
    $status = 'keep';
    $status = 'discard' if ($lex =~ m{/\w+/});
  }
  if ($left == 0 && $lex =~ /^[[:upper:]]/) {
    my $llex = lc($lex); 
    $altbox ||= { found => 0 };
    if ($llex eq 'a') {
      send_word('',$truelex,$cat,$left,$right,'discard',$altbox);
    }
    ##        my $lcat = $cat || "det|pri|prep|adv";
    send_word($llex,$truelex,$cat,$left,$right,'discard',$altbox);
    send_word($lex,$truelex,$cat,$left,$right,'keep',$altbox);
  } else {
    send_word($lex,$truelex,$cat,$left,$right,$status,$altbox);
  }
}

sub send_word {
  my ($lex,$truelex,$cat,$left,$right,$status,$altbox) = @_;
  ##  print "%% preparing sending $lex: @fifo\n";
  my @first = ($#fifo > 5 ? @fifo[0 .. 4] : @fifo);
  foreach my $word (@first) {
    ##    print "%% trying merge\n";
    if ($word->{right} == $left) {
      my $newlex = "$word->{lex} $lex";
      $newlex =~ s/'\s+/'/og;
      if (exists $locutions{$newlex} || exists $locutions{lc($newlex)}) {
	my $newtruelex = "$word->{truelex} $truelex";
	$newtruelex =~ s/' /'/og;
	send_word($newlex,$newtruelex,undef,$word->{left},$right,'discard',$word->{altbox});
      } elsif (!$word->{sent}) {
	$word->{sent} = 1;
	handle_word($word);
      }
    }
  }
##  print "%% sending $lex ($truelex)\n";
  my $locution = (exists $locutions{$lex}) || (exists $locutions{lc($lex)});
  my $data = {
	      lex     => $lex,
	      truelex => $truelex,
	      cat     => $cat,
	      left    => $left,
	      right   => $right,
	      status  => $status,
	      altbox  => $altbox,
	      sent    => 0
	     };
  if ($locution) {
    ## we wait until next word before deciding trying $lex
    $data->{altbox} = $altbox ||= {found => 0 };
##    $data->{status} = 'discard';
    @fifo = waiting_word($data,@fifo);
    return;
  }
  if ($lex =~ /^(?:$prefpat)([^-]+)/) {
    my $seed = $1;
##    print "Try compound $lex seed=$seed\n";
    $data->{altbox} = $altbox ||= {found => 0};
    $data->{status} = 'discard';
    handle_word($data);
    my $seeddata = {
		    lex     => $seed,
		    truelex => $truelex,
		    cat     => $cat,
		    left    => $left,
		    right   => $right,
		    status  => 'discard_but',
		    altbox  => $altbox,
		    sent    => 0
		   };
    handle_word($seeddata);
  } else {
    handle_word($data);
  }
}

sub waiting_word {
  my ($word,@l) = @_;
  if (@l && $word->{left} == $l[0]->{left} &&  $word->{right} == $l[0]->{right}) {
    my $x = shift @l;
    return ($x,waiting_word($word,@l));
  } else {
    return ($word,@l);
  }
}

sub read_macrosfile {
  my $macrosfile = shift;
  open(MACRO,"<$macrosfile") || die "can't open $macrosfile";
  while (<MACRO>) {
    next unless /^\@/;
    my $line = $_;
    $line =~ s{(/{2}.*)$}{};
    chomp $line;
    my ($name,$body) = split(/\s*=\s*/,$line,2);
    next unless ($body =~ /^\[/);
    ##   print "PARSING MACRO $name $body\n";
    ##    my $fs = $parser->macrodef(\$body);
    ##    print dump($fs->{f});
    ##    $macro{$name} = [ %{$fs->{f}} ];
    $macro{$name} = $body;
  }
  close(MACRO);
  return %macro;
}

sub macro_expand ($) {
  my $macro = shift;
  return [] unless exists $macro{$macro};
##  $parser->YYData->{INPUT} = "$macro{$macro}";
##  print "MACRO EXPAND $macro $macro{$macro}\n";
  my $parser = new LefffDecoder();
  my $fs = $parser->Run("$macro{$macro}");
##  print "expand macro $macro: $macro{$macro} -> $fs\n";
  return [ %{$fs->{f}} ];
}

sub quote {
  my $string = shift || "";
  return $string if ($string =~ /^\d+$/);
##  return $string if ($string =~ /^[a-z]\w*$/o);
  return $string if ($string =~ /^[a-z][a-zA-Z]*$/o);
  $string =~ s/\'/\'\'/og;
  # $string =~ s/\'(?!\')/\'\'/og;
  return "\'$string\'";
}

sub unquote {
  my $string = shift || "";
  $string =~ s/^'(.*)'$/$1/o;
  return $string;
}

sub parse_map_file {
    my $map = shift;
    my $file = shift;
    open("MAP","<$file") || die "can't open map file $file";
    while (<MAP>) {
        chop;
        next if (/^\#/);
        next if (/^\s*$/);
        $map->{$1}{$2} = $3, next if (/^(\w+)\s+(\S+)\s*:\s*(.+)\s*$/);
        $map->{$1} = $2, next if (/^(\w+)\s*:\s*(.+)\s*$/);
    }
    close(MAP);
}

sub dag_decode {
  my $edge = shift;

  my ($left,$comment,$token,$right) = ($edge =~ /^(\d+)\s+\{(.*?)\}\s+(\S+)\s+(\d+)/);
  $token =~ s/^"(\S+)"$/$1/o;   ## remove quotes introduced by dag2udag
  $token =~ s/\\(["?+])/$1/og;
  my $xcomment;
  if ($comment =~ /<F/) {
    ## we are in a DAG generated by sxpipe: beware of positions that starts at 1
    ## => need to decrement because DyALog likes to have DAG pos starting at 0

    $left--;
    $right--;
    my @comments = ();
    while ($comment =~ m{<F\s+id="(.+?)"\s*>\s*(.*?)\s*</F>}og) {
      my ($id,$w) = ($1,$2);
      $w =~ s/\\(["?+])/$1/og;
      my $idw = ($confcomment) ? "$id|$w" : "$w";
      push(@comments,$idw) unless grep( $_ eq $idw,@comments);
    }
    if ($token eq '_URL') {
      $comment = join('',@comments);
    } else {
      $comment = join(' ',@comments);
    }
  }
  $xcomment = $comment;
  $xcomment =~ s/E\d+(?:\.\d+)?F\d+\|//og;
  my $cat;
  if ($token =~ /^(\S+)__(\S+?)$/) {
      $token = $1;
      $cat = $2;
  }
  if (($comment eq $xcomment) && $comment =~ /\w'\w/) {
    ## if tokens are not identified by easy indices
    ## we try to get something more readable in case
    ## of apostrophes
    ## *** WARNING *** not full proof
    ## in particular when mixing ' and - (l'anti-stress)
    my ($a,$b) = split(/'/,$comment);
    $a = "$a'";
    my $tmp = $token;
    $tmp =~ s/-_$/-/o;
    if ($token eq $a || $token eq lc($a)) {
      $comment = $a;
    } elsif ($token eq $b || $token eq lc($b)) {
      $comment = $b;
    } elsif ($b =~ /\L\Q$tmp/) {
      $comment = $b;
    }
  }
  ## '_' inside words are replaced by spaces unless the word starts with '_'
  unless ($token =~ /^_[A-Z]/) {
    $token =~ s/(?<![_-])_(?![_-])/ /og;
    $token =~ s/^ /_/o;
    $token =~ s/ $/_/o;
  }
  my $data = { lex => $token,
	       truelex => $comment,
	       left => $left,
	       right => $right,
	       status => 'keep'
	     };
##  print Data::Dumper->Dump([$data],['data']);
  $data->{cat} = $cat if (defined $cat);
  $dagmax = $right if ($right > $dagmax);
  if (@fifo && $fifo[0]->{left} == $left) {
##    unshift @fifo;
    ## print "Removing '$fifo[0]->{lex}'\n";
    shift @fifo;
  }
  if ($left == 0 && $token eq '_Uw') {
    ##    print "Pushing '$token' for delay\n";
    push(@fifo,$data) unless ($dagfirst);
  } else {
    ## print "Handling '$token'\n";
    $dagfirst = $token if ($left == 0);
##    handle_word($data);
    if (exists $redirect->{$xcomment}) {
      foreach my $new (keys %{$redirect->{$xcomment}}) {
	my $cat = $redirect->{$xcomment}{$new};
	my $newdata = { lex => $new,
			truelex => $comment,
			left => $left,
			right => $right,
			status => 'keep'
		      };
	$newdata->{cat} = $cat unless ($cat == 1);
##	print "Redirect for $xcomment pushes $new\n";
	push(@{$dag{$left}{$right}},$newdata);
      }
    } else {
      push(@{$dag{$left}{$right}},$data);
    }
    unless ($left == 0) {
      ## tmp fix to solve some problemes due to SXPIPE on compound words (prefixes, suffixes, ..)
      my $xleft = $left - 1;
      foreach my $pdata (@{$dag{$xleft}{$left}}) {
	if ($pdata->{truelex} eq $comment) {
	  $_->{prefix} = 1 foreach (@{$dag{$xleft}{$left}});
	  $_->{suffix} = 1 foreach (@{$dag{$left}{$right}});
	  if (exists $missing->{$xcomment}) {
	    $dag{$xleft}{$left} = [];
	    $dag{$left}{$right} = [];
##	  }
##	  unless ($xcomment =~ /\'/ || exists $agglutinates{lc($xcomment)}) { 
	    my $newdata = { lex => $missing->{$xcomment},
			    truelex => $comment,
			    left => $xleft,
			    right => $right,
			    status => 'keep'
			  };
##	    print "Agglutinates: pushing $missing->{$xcomment} at $xleft\n";
	    push(@{$dag{$xleft}{$right}},$newdata);
	  }
	  last;
	}
      }
    }
  }
}

## MAF Handling

sub maf_handle {
  my $maf = shift;

  require XML::Twig;

  my $twig = XML::Twig->new(
			    twig_handlers   => {
				              token          => \&handle_maf_token,
					      transition     => \&handle_maf_transition,
					      'maf/wordForm' => \&handle_maf_wordform,
					      'maf/wfAlt'    => \&handle_maf_wfAlt
					     },
			    pretty_print    => 'indented',
			    output_encoding => 'ISO-8859-1',
			    keep_encoding   => 1
			   );
  $twig->parse($maf);
  $twig->purge;
}

sub handle_maf_token {
  my ($t,$token) = @_;
##  print "MAF TOKEN ".$token->att("id")." ".($token->att("value") || $token->trimmed_text)."\n";
  $mafstruct{tokens}{$token->att("id")} = $token->att("value") || $token->trimmed_text;
}

sub normalize_lingua_tagvalue {
  my ($f,$v) = @_;
  $f = $lingua_feature_transfer{$f} || $f;
  if ($lingua_value_expand{$f}) {
    my $type = Lingua::Features::FeatureType->type($f);
    $v = $type->value_name($v) || $v;
  }
  return normalize_value($v);
}

sub handle_maf_transition {
  my ($t,$trans) = @_;
  my $left  = $trans->att("source");
  my $right = $trans->att("target");
  my $wf    = $trans->first_child("wfAlt");
  $trans    = $wf if (defined $wf);
  foreach my $wf ($trans->children("wordForm")) {
    handle_maf_wordform($t,$wf,$left,$right);
  }
}

sub handle_maf_wfAlt {
  my ($t,$alt) = @_;
  my $save = $mafcurrent;
  foreach my $wf ($alt->children("wordForm")) {
    handle_maf_wordform($t,$wf);
    $mafcurrent = $save;
  }
  $mafcurrent=$save+1;
}

sub handle_maf_wordform {
  my ($t,$wf,$left,$right) = @_;
##  $left =~ s/[^\d]+//og;
##  $right =~ s/[^\d]+//og;
##  print "MAF left=$left right=$right current=$mafcurrent\n";
  if (defined $left) {
##    $left = $mafstruct{state}{$left} ||= $mafcurrent;
##    $right = $mafstruct{state}{$right} ||=  $left+1;
    $left  =~ s/[^\d]+//og;
    $right =~ s/[^\d]+//og;
  } else {
    $left  = $mafcurrent;
    $right = $left+1;
  }
  $mafcurrent = $right;
  my @tokens = split(/\s+/,$wf->att("tokens"));
  my $entry  = $wf->att("entry") || "";
  $entry =~ s/^urn:.*?://;
  $entry =~ s/_\d+$//;
##  $entry =~ s/ /_/og;
  my $truetoken = join(" ",map($mafstruct{tokens}{$_},@tokens));
  my $token = $wf->att("form") || $truetoken;
  $token =~ s/([^_-])_([^_-])/$1 $2/og;
  ## print "MAF TRANS $left $right @tokens $token $entry\n";
  my $fs = {};
  ##  print "Tokens @tokens => $token\n";
  if (defined $wf->child("fs")) {
    my $xmlfs= $wf->child("fs");
    foreach my $f ($xmlfs->children("f")) {
      my $k = $f->att("name");
      my @v = ();
      ##      print "Found ".$f->gi." $k\n";
      foreach ($f->descendants("symbol")) {
	my $value = $_->att("value");
	next unless ($value);
	push(@v,normalize_value($value));
      }
      foreach ($f->descendants("numeric")) {
	my $value = $_->att("value");
	next unless ($value);
	push(@v,normalize_value($value));
      }
      foreach ($f->descendants("binary")) {
	my $value = $_->att("value");
	next unless ($value);
	push(@v,normalize_value($value));
      }
      ## print "Found values @v\n";
      $fs->{normalize_feature($k)} = [@v];
    }
  } elsif (defined $wf->att("tag")) {
    my $tag = $wf->att("tag");
##    $tag =~ s/\bpos\./cat./og;
##    print "MAF $tag\n";
    foreach my $feature (split(/\s+/,$tag)) {
      my ($f,@v) = split(/\./,$feature);
##      print "MAF TYPE $f '$type'\n";
##      @v = map( normalize_value( ($type && $type->value_name($_)) || $_), @v);
      @v = map( normalize_lingua_tagvalue($f,$_), @v);
##      print "MAF2 $f @v\n";
      $f = normalize_feature($f);
      $fs->{$f} = [ @v ];
    }
  } else {
    $fs = undef;
  }
  my $data = { lex     => $token,
	       truelex => $truetoken,
	       left    => $left,
	       right   => $right,
	       status  => 'keep',
	       entry   => $entry,
	       fs      => $fs
	     };
  $data->{cat} = join('|',@{$fs->{pos}}) if (defined $fs && exists $fs->{pos});
##  print Data::Dumper->Dump([$data],['data']);
  $dagmax = $right if ($right > $dagmax);
#  print "Ready to handle word '$entry' @tokens '$token' $left $right '$data->{cat}'\n";
  if (@fifo && $fifo[0]->{left} == $left) {
#    unshift @fifo;
    shift @fifo;
  }
  if (0 && $left == 0 && $token eq '_Uw') {
    ## should examine this case more carefully before re-activation
    push(@fifo,$data);
  } else {
    handle_word($data);
  }
}

sub build_elt {
  my $v = shift;
  if ($v eq '+') {
    return TAG::Plus->new;
  } elsif ($v eq '-') {
    return TAG::Minus->new;
  } else {
    return TAG::Val->new({text=>$v});
  }
}

sub fs_unify {
  my ($fs1,$fs2) = @_;
  ## print "TRY UNIFY\n";
  ## Warning: $fs1 and $fs2 are both fs but do not follow the same data model
  foreach my $k (keys %{$fs2}) {
    next if ($k =~ /pos/ || $k =~ /fct/ || $exclude_feature{$k});
    if (exists $fs1->{f}{$k}) {
      ##    print "CHECK KEY $k\n";
      my $v2 = $fs2->{$k};
      my $v1 = $fs1->{f}{$k};
      foreach my $v (@$v2) {
	my $vv = build_elt($v);
	## print "Check compare on $k:$v => $vv\n";
	##  print "\twith ".join(' ',map($_->text,@{$v1->{value}}))."\n";
	return 0 unless grep(Compare($vv,$_),@{$v1->{value}});
	## print "Check compare on $k:$v => OK\n";
      }
    } else {
      ##      $clone->();
      ## print "ADD KEY $k\n";
      $fs1->{f}{$k} = TAG::Comp->new( { name  => $k,
					rel   => 'eq',
					value => [ map( build_elt($_), @{$fs2->{$k}} ) ]
				      } );
    }
  }
  return 1;
}

1;

__END__

=head1 NAME

frmg_lexer - a small segmenter tool

=head1 SYNOPSIS

echo "il mange une pomme" | frmg_lexer

echo "il mange une pomme" | frmg_lexer [options]

where the options are

=over 4

=item -cache|c=F<file>

=item -comment

=item -complete=F<file>        [default F<complete.lex>]

=item -datadir|p=F<path>       [default F<<pkgdatadir>/>]

=item -dico                 [default dico.xlfg]

=item -dicodir|p=F<path>       [default F<<libdir>/lefff-frmg>]

=item -expand               [default 1]

=item -lexed|l=F<file>         [default F<<bindir>/lexed>]

=item -locutions=F<file>       [default F<dico.xlfg.cmpnd>]

=item -macros|m=F<file>        [default F<dico.xlfg.templates>]

=item -map=F<file>             [default F<frgram.map>]

=item -missing=F<file>         [default F<missing.lex>]

=item -prefixes=F<file>        [default F<dico.sxspell.pref>]

=item -restrictions=F<file>    [default F<restrictions.txt>]

=item -sxpipe               [default 1]

=item -sxpipecmd=F<file>       [default F<sxpipe>]

=item -tmpdir=F<path>          [default F</tmp>]

=item -utf8

=item -vcomp_expand         [default 1]

=back

=head1 DESCRIPTION

B<frmg_lexer> is a small segmenter tool (needs access to a lexed
lexicon). The name B<easy> refers to EASy, an evaluation campaign for
French parsers. This script is part of ALPAGE Linguistic Processing
Chain for French: L<http://alpage.inria.fr/alpc.en.html>.

B<frmg_lexer> can take a sentence or a DAG as input, Try:

    echo "il mange une pomme" | frmg_lexer
    cat sample_udag | frmg_lexer

To run frmg_lexer without the pre-processing chain sxpipe:

    echo "il mange une pomme" | frmg_lexer --nosxpipe

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004-2008, INRIA.

This program is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Eric de la Clergerie <Eric.De_La_Clergerie@inria.fr>

=head1 SEE ALSO

=over 4

=item Evaluation campaign EASY L<http://www.limsi.fr/Recherche/CORVAL/easy/>

=item Alpage project team L<http://alpage.inria.fr>

=back

=cut
