#!/usr/bin/perl

# Command-line switches.  They must come before any other arguments:
# parsing stops at the first argument that does not start with "-".
# Results are left in package globals read by the rest of the script:
#
#   -t        $plaintext = 1
#   -m        $missing_words_only = 1
#   -s        $suppress_line_numbers = 1
#   -p VALUE  $pagearg = VALUE
#   -l VALUE  $linearg = VALUE
#   -h        print a usage summary and exit
#
# Any other switch is fatal.

# Returns the usage summary printed by -h.
sub usage_text {
    return join("\n",
        "Usage: tp [-t] [-m] [-s] [-p PAGE] [-l LINE]",
        "  -t       plain-text (tab-separated) output instead of tagged output",
        "  -m       print only the words that have no dictionary entry",
        "  -s       suppress the <lineid> line numbers",
        "  -p PAGE  start printing at this page",
        "  -l LINE  start printing at this line of the page given with -p",
        "  -h       show this help",
        "");
}

# Removes and returns the value that follows switch -$name in @ARGV.
# Dies if the command line ends right after the switch.
sub take_option_value {
    my ($name) = @_;

    die "Option -$name requires a value.  Use tp -h for help.\n"
        unless @ARGV;
    return shift(@ARGV);
}

# Consumes the leading switches from @ARGV and sets the option globals.
sub parse_options {
    # The @ARGV test keeps substr() from being handed an undefined value
    # once every argument has been consumed.
    while (@ARGV && substr($ARGV[0], 0, 1) eq "-") {
        my $option = substr(shift(@ARGV), 1);

        if    ($option eq "t") { $plaintext = 1; }
        elsif ($option eq "m") { $missing_words_only = 1; }
        elsif ($option eq "s") { $suppress_line_numbers = 1; }
        elsif ($option eq "p") { $pagearg = take_option_value("p"); }
        elsif ($option eq "l") { $linearg = take_option_value("l"); }
        elsif ($option eq "h") {
            # The error message below points users at -h, so it has to work.
            print usage_text();
            exit(0);
        }
        else {
            die "Invalid argument -$option.  Use tp -h for help.\n";
        }
    }
    return;
}

parse_options();


# Build %wordhash: lower-cased word form => concatenated
# "<entry><pos>..</pos><def>..</def></entry>" strings, one per dictionary
# record for that form.  Records are tab-separated: word, part of speech,
# definition.
#
# Alternative source, kept for reference:
# open(INPUT_FILE, "cat /home/kurisuto/documents/linguistics/norse_class/oi_wordforms |") || die "Fatal error:$!\n";

# Adds one dictionary record (a single input line) to %wordhash.
# Records whose word field is empty or missing are ignored.
sub add_wordform {
    my ($record) = @_;

    # chomp, not chop: chop would eat the last real character of a final
    # record that has no trailing newline.
    chomp($record);

    my ($form, $pos, $def) = split(/\t/, $record);
    return unless defined($form);

    $form =~ tr/A-Z/a-z/;
    return if $form eq "";

    # Short records simply produce empty <pos> / <def> elements.
    $pos = "" unless defined($pos);
    $def = "" unless defined($def);

    $wordhash{$form} .= "<entry><pos>" . $pos . "</pos><def>" . $def . "</def></entry>";
    return;
}

# The command goes through the shell so that the "inflected*" glob is
# expanded and every matching file is read.
my $wordform_cmd = "cat /home/kurisuto/documents/linguistics/norse_class/inflected*";
open(my $wordform_fh, '-|', $wordform_cmd) or die "Fatal error:$!\n";

while (my $record = <$wordform_fh>) {
    add_wordform($record);
}

# Closing a pipe reports the command's exit status.  Only warn, so the
# script still runs (with whatever was read) when cat fails.
unless (close($wordform_fh)) {
    warn $!
        ? "Error closing word-form pipe: $!\n"
        : "Warning: \"$wordform_cmd\" exited with status " . ($? >> 8) . "\n";
}


# Gloss the labelled sentences.
#
# Each input line carries a label of the form <LN>page:line</LN>.  Output
# starts at the first line whose page equals $pagearg and whose line number
# is at least $linearg, and then continues to the end of the file.  Both
# are compared numerically, so an option that was not given counts as 0:
# without -p nothing is printed unless some line is labelled page 0.
#
# For every space-separated token of a printed line the script emits
# "<line><token>TOKEN</token>ENTRIES</line>", where ENTRIES is whatever
# %wordhash holds for the token's lookup key.

# Returns the %wordhash key for one raw token: surrounding punctuation
# removed, ASCII letters lower-cased.
sub lookup_key {
    my ($key) = @_;

    # Character entities such as "&a-acute;" end in a semicolon that the
    # punctuation stripping below would remove.  Hide that semicolon behind
    # a control character; a printable stand-in would turn every literal
    # occurrence of itself in the text into a semicolon on the way back.
    $key =~ s/(-acute|-long|&OElig|&oelig|&o-hook|&O-hook);/$1\x01/g;

    # Strip leading and trailing punctuation.
    $key =~ s/^['"(]*//;
    $key =~ s/['"!?.,;:)]*$//;

    # Restore the entity semicolons.
    $key =~ tr/\x01/;/;

    $key =~ tr/A-Z/a-z/;
    return $key;
}

# Builds the output text for one token.  $entries is the %wordhash value
# for the token and may be undefined.  Reads the global $plaintext.
sub render_token {
    my ($token, $entries) = @_;

    $entries = "" unless defined($entries);
    my $output = "<line><token>" . $token . "</token>" . $entries . "</line>";

    if ($plaintext == 1) {
        # One entry per line, fields separated by tabs, all tags removed.
        $output =~ s/<\/entry><entry>/\n\t/g;
        $output =~ s/<entry>/\t/g;
        $output =~ s/<\/pos><def>/\t/g;
        $output =~ s/<.*?>//g;
    }

    return $output;
}

my $sentences_path = "/home/kurisuto/documents/linguistics/norse_class/oigt_labeled_sentences";
open(my $sentences_fh, '<', $sentences_path) or die "Fatal error:$!\n";

my $printing = 0;

while (my $line = <$sentences_fh>) {

    # chomp, not chop: a final line without a newline keeps its last character.
    chomp($line);

    # Cut the label out of the line.  $1 is read only when the substitution
    # matched; after a failed match it would still hold an older capture.
    my $line_id = "";
    if ($line =~ s/<LN>(.*)<\/LN>//) {
        $line_id = $1;

        my ($thispage, $thisline) = split(/:/, $line_id);
        if ($thispage == $pagearg && $thisline >= $linearg) {
            $printing = 1;
        }
    }

    next unless $printing == 1;

    if ($suppress_line_numbers != 1) {
        print "\n<lineid>", $line_id, "</lineid>\n\n";
    }

    foreach my $token (split(/ /, $line)) {
        my $entries = $wordhash{lookup_key($token)};
        my $missing = (!defined($entries) || $entries eq "");

        # With -m only the words that have no dictionary entry are shown.
        if ($missing_words_only == 0 || $missing) {
            print render_token($token, $entries), "\n";
        }
    }
}

close($sentences_fh);

