#! /usr/bin/perl

use strict;
#use warnings 'all';

use HTML::TreeBuilder 5 -weak; # Ensure weak references in use
use IO::HTML qw(html_file_and_encoding html_outfile);
use File::Copy;
use Cwd;
use Getopt::Long qw(:config no_ignore_case bundling);
use Data::Dumper;
use utf8;

# Keep debug dumps shallow and write output unbuffered.
$Data::Dumper::Maxdepth = 1;
$| = 1;

# ANSI escape sequences used for coloured and self-overwriting messages.
# They are only non-empty when STDOUT is a terminal, so redirected output
# stays free of control codes.
my $stdout_is_tty = -t STDOUT;
my $col_err  = $stdout_is_tty ? "\033[1;31m" : "";	# light red
my $col_info = $stdout_is_tty ? "\033[1;33m" : "";	# light yellow
my $col_off  = $stdout_is_tty ? "\033[0m"    : "";	# back to default attributes
my $csr_up   = $stdout_is_tty ? "\033[1A"    : "";	# cursor up one line
my $clr_line = $stdout_is_tty ? "\033[K"     : "";	# clear to end of line

# ---- defaults for the command line options ----
my $absentfilename='';	# write a file with absents if ne''
my $dictpath="/usr/local/etc";
my $backup='';
my $compress='';    # compress defaults to false
my $discard_comments='';
my $dryrun='';	# do not write output file if true
my $followlinks='';
my $lang='de';	# tags in other languages are ignored
my $verbose='';
my $inplace=''; # replace (overwrite) input file
my $keepshy=''; # keep existing &shy; in source
my $hyphenate='1';  # apply hyphenation
my $tags="a b del div em i ins li mark p q small span strong u td th";
my $printabsents='6';
my $recursive='';
my $quiet='';

# ---- global state shared by the subs below ----
# These are declared before GetOptions runs because the --dict handler
# reads dictionaries (and thereby fills %words) while the options are
# still being parsed.
my %words; # the dictionary: bare word => word with &shy; inserted
my %absents=(); # words not in the dictionary
my %absentpath=(); # last path for absent words
my $current_file="";	# needed for absents
my $wordstotal=0;
my $fileswritten=0;
my $level=0;	# for debug indention

my $time_start=time;
binmode(STDOUT, ":utf8");
# environment variables override the built-in defaults, options override both
$dictpath=$ENV{'HYPHENATEHTML_DICT_PATH'} if $ENV{'HYPHENATEHTML_DICT_PATH'};
$tags=$ENV{'HYPHENATEHTML_TAGS'} if $ENV{'HYPHENATEHTML_TAGS'};
GetOptions(
	'absents|a=i' => \$printabsents,
	'backup|b' => \$backup,
	'compress|c' => \$compress,
	'dict|d=s' => \&optionHandler,
	'discard-comments|C' => \$discard_comments,
	'dryrun|n' => \$dryrun,
	'help|h' => sub { help() },
	'hyphenate!' => \$hyphenate,
	'inplace|I' => \$inplace,
	'keepshy|k' => \$keepshy,
	'lang|L=s' => \$lang,
	'links|l' => \$followlinks,
	'tags|t=s' => \$tags,
	'quiet|q' => \$quiet,
	'recursive|r' => \$recursive,
	'verbose|v+' => \$verbose,
	'writeabsents|w=s' => \$absentfilename,
) or do {
	# usage() only prints; stop here instead of running with a
	# half-parsed command line.
	usage();
	exit 2;
	};
if ($backup) { $inplace='1'; }   # backup implies inplace
if ($quiet) { $printabsents=0; $verbose=0; }
# dictionaries from the environment are loaded after those given with --dict
if ($ENV{'HYPHENATEHTML_DICT'}) {
	my @d=split(' ',$ENV{'HYPHENATEHTML_DICT'});
	foreach my $f (@d) {
		readDictSimple ($f);
		}
	}
my $time_wordlists=time;

$tags =~ s/^\s+|\s+$//g;
my @tags_a=split(' ',$tags);

# for all files in argument list
foreach my $htmlfilename (@ARGV) {
	doFile($htmlfilename);
	}
if ($printabsents) {    # print words not in dictionary
	printAbsents();
	}
my $time_done=time;

if ($verbose) {
	printf ("time for reading wordlists: %ds\n",$time_wordlists-$time_start);
	printf ("time for hyphenating: %ds\n",$time_done-$time_wordlists);
	print "$wordstotal words hyphenated.\n";
	print "$fileswritten files written.\n";
	}
exit;   # we're done

#######################################################

# Recursively walks the children of an element and hyphenates their text.
#   $element      - HTML::Element whose child elements are examined
#   $lang_current - language in effect for $element; a child's own lang
#                   attribute replaces it for that child and its subtree
# For every child element whose tag is listed in @tags_a and whose language
# starts with $lang, each direct text node is replaced by the result of
# hyphenateString(). The tree is modified in place.
# $wordstotal is incremented once per call (i.e. per visited element).
sub oneElement
{
my ($element,$lang_current)=@_;
my $indent="    "x$level;
$wordstotal++;
if ($verbose>2) {
	my $tag=$element->tag();
	my $plang=$element->attr('lang');
	print "$indent*** oneElement lang=$lang_current ***\n";
	print "$indent  $tag defined_lang:$plang\n";
	}
foreach my $el ($element->content_list()) {
	next if ref($el) ne "HTML::Element";	# plain strings are text nodes
	my $tag=$el->tag();
	my $lang_element=$el->attr('lang') || $lang_current;
	if ($verbose>2) { print "${indent}child $tag lang:$lang_element\n" };
	# compare tags literally; a regex built from the parsed tag name would
	# misbehave (or die) on tags containing regex metacharacters
	if (grep { $_ eq $tag } @tags_a) {
		if ($verbose>2) { print "$indent--Element $tag lang:$lang_element\n" };
		my $content=$el->content();
		next if !$content;	# nothing inside: no text and nothing to descend into
		# rindex(...,0)==0 is true only if $lang_element starts with $lang
		if (rindex($lang_element,$lang,0)==0) {
			foreach my $node (@$content) {	# $node aliases the content slot
				next if ref($node);	# child elements are handled by the recursion
				$node=hyphenateString($node);
				if ($verbose>2) { print "${indent}content:",$node,"\n" };
				}
			}
		}
	$level++;
	oneElement($el,$lang_element);
	$level--;
	}
}

# Processes all regular files matching one file name pattern.
#   $filePat - file name, may contain wildcards (expanded with glob)
# Every matching regular file in the current directory is passed to
# oneFile(). With --recursive the routine then changes into each
# sub-directory (names starting with '.' and symbolic links are skipped),
# calls itself with the same pattern and changes back.
# Dies if a sub-directory cannot be entered or left again.
sub doFile
{
my $filePat=shift;
foreach my $file (glob($filePat)) {	# expand wildcards
	next if !-f $file;
	oneFile($file);
	}
return if !$recursive;

# A lexical handle is used and the listing is read completely before
# descending, so the recursive calls cannot disturb this level.
opendir(my $dh,'.') or do {
	warn "${col_err}cannot read directory ",cwd()," - $!${col_off}\n";
	return;
	};
my @entries=readdir($dh);
closedir($dh);
foreach my $entry (@entries) {
	next if $entry =~ m/^\./;
	next if -l $entry;
	next if !-d $entry;	# directories only
	# 'or' (not '||') so that a failing chdir is really detected; otherwise
	# the following chdir '..' would leave the directory tree being processed
	chdir($entry) or die "cannot chdir to $entry. Error $!";
	print "${col_info}entered directory ",cwd(),"${col_off}\n" if $verbose>1;
	doFile($filePat);
	chdir('..') or die "cannot chdir back from $entry. Error $!";
	}
}

# process one file
# The file must exist and the filename must not contain wildcards.
#   $htmlfilename - name of the HTML file, relative to the current directory
# Symbolic links (unless --links) and directories are skipped, as are pages
# without <html lang=..> or with a language not starting with $lang.
# Otherwise the file is parsed, hyphenated via oneElement() and written to
# "<name>.hyp.html", or over the input file with --inplace/--backup.
# With --dryrun nothing on disk is changed: no output file and no backup.
# Sets $current_file and increments $fileswritten for every file written.
# Dies if parsing, creating the backup, writing or chmod fails.
sub oneFile
{
my $htmlfilename=shift;
$current_file=cwd() . '/' . $htmlfilename;
if ((-l $htmlfilename) && !$followlinks) {
	# print, not printf: the file name must not be treated as a format
	if ($verbose) { print "${col_info}skipping symbolic link $htmlfilename$col_off\n"; }
	return;
	}
if (-d $htmlfilename) {
	if ($verbose) { print "${col_info}skipping directory $htmlfilename$col_off\n"; }
	return;
	}
# keep the original permission bits (without the file type bits) for later
my $filemode=(stat($htmlfilename))[2] & 07777;
my $outfile=$inplace ? $htmlfilename : "$htmlfilename.hyp.html";
my $tree = HTML::TreeBuilder->new; # empty tree
# set some options
$tree->store_comments($discard_comments?0:1);
$tree->no_space_compacting($compress?0:1);
$tree->ignore_ignorable_whitespace($compress?1:0);
$tree->no_expand_entities(1);
$tree->ignore_unknown(0);	# important! TreeBuilder will discard <math> otherwise!
#	$tree->implicit_tags(0);
# read and parse file
if ($verbose) { print "reading HTML file ",cwd(),"/$htmlfilename\n"; }
my ($fh,$encoding,$bom)=html_file_and_encoding($htmlfilename);
if ($verbose>1) { printf "filemode: %04o, encoding: %s\n",$filemode,$encoding; }
$tree->parse_file($fh) || die "${col_err}cannot parse: $!${col_off}";
close ($fh);

my ($html)=$tree->look_down('_tag','html');
my $plang=$html->attr('lang');
# NOTE(review): $csr_up$clr_line overwrites the previous terminal line. In
# non-verbose mode that line is not the "reading HTML file" message -
# confirm this is intended.
if (!$plang) {
	print "$csr_up$clr_line${col_err}*** Skipping page without language. file=",cwd(),"/$htmlfilename ${col_off}\n";
	return;
	}
if (rindex $plang, $lang, 0) {	# non-zero unless $plang starts with $lang
	print "$csr_up$clr_line${col_err}*** Skipping page <html lang=$plang> file=",cwd(),"/$htmlfilename ${col_off}\n";
	return;
	}
# look at all words we want to hyphenate

oneElement($html,$plang);

if ($verbose) { print "total: $wordstotal words\n"; }
if ($wordstotal==0) {
	print "${col_info}did not see any words. skipping \"$htmlfilename\".${col_off}\n";
	return;
	}
# A dry run must not touch the disk at all. In particular it must not move
# the original away to .bak without writing a replacement.
return if $dryrun;

# write processed tree
if ($backup) {
	if ($verbose) { print "creating backup\n"; }
	# never overwrite the original unless the backup really exists
	move($htmlfilename,"$htmlfilename.bak")
		or die "${col_err}cannot create backup $htmlfilename.bak - $!${col_off}";
	}
if ($verbose) { print "writing target file $outfile\n"; }
my $outfh=html_outfile("$outfile",$encoding, $bom);
print $outfh $tree->as_HTML(undef,undef,{});
# buffered write errors (e.g. disk full) only show up at close
close($outfh) or die "${col_err}cannot write $outfile - $!${col_off}";
chmod($filemode,$outfile) || die "${col_err}cannot chmod $!${col_off}";
$fileswritten++;
}

# Callback for Getopt::Long, invoked with the option name and its value
# at the moment the option is parsed.
# Only "dict" is handled: the named dictionary is read immediately.
sub optionHandler
{
	my $name=shift;
	my $value=shift;
	readDictSimple($value) if $name eq "dict";
}

# reads one dictionary file
#   $dictfile - file name; $dictpath is prepended unless the name contains a '/'
# The file is read as UTF-8, one entry per line. Text from a '#' at the
# start of a line or after whitespace is treated as a comment and removed.
# Lines containing ';' are treated as dante-format: the hyphenated form is
# taken from field 1 (or field 3 if field 1 is "-2-"), entries marked "-4-"
# are skipped and the hyphenation marks are converted to '|'.
# Every resulting line is handed to pushWord().
# If hyphenation is switched off the dictionary is not opened at all.
# Dies if the dictionary cannot be opened.
sub readDictSimple
{
my $dictfile=shift;
$dictfile = "$dictpath/$dictfile" unless $dictfile =~ m/\//;
# check before opening: nothing to read, and no file handle left open
if (!$hyphenate) { print "Hyphenation not requested - skipping dictionary $dictfile\n"; return; }
open (my $fh, '<:utf8', $dictfile) or die "${col_err}cannot read dictionary $dictfile - $!${col_off}";
if ($verbose) { print "Reading dictionary $dictfile - "; }
while (my $word=<$fh>) {
	$word =~ s/^\s+|\s+$//g;    # trim whitespaces (including the newline)
	# remove comment; '^' also catches whole-line comments, which would
	# otherwise be stored as words
	$word =~ s/(?:^|\s+)\#.*//;
	if ($word =~ m/;/) {    # seems to be dante-format
		my @s=split(/;/,$word);
		$word=($s[1] eq "-2-"? $s[3]:$s[1]);
		next if !defined $word;	# line without a hyphenated form
		next if ($word eq "-4-");
		# $word is a single field here, it cannot contain ';' any more
		$word =~ s/\-\.|<\.|·//g; # remove unwanted hyphens
		$word =~ s/\.//g;   # remove unwanted hyphens
		$word =~ s/=+|-|<|>/\|/g;	# all remaining marks become '|'
		}
	pushWord ($word);
	}
close($fh);
if ($verbose) { my $nwords=keys %words; print "${col_info}$nwords words${col_off}\n"; }
}

# Stores the word(s) of one dictionary line in %words.
# The line uses '|' for possible hyphenation points and may hold several
# words separated by whitespace. For each word the key is the bare word
# (all '|' removed) and the value is the HTML form ('|' replaced by &shy;).
# Empty entries are ignored.
sub pushWord
{
my $line=shift;
for my $entry (split('\s',$line)) {
	(my $bare=$entry) =~ s/\|//g;	# word without hyphenation marks
	next if $bare eq "";
	(my $shy=$entry) =~ s/\|/\&shy;/g;	# word with soft hyphens
	$words{$bare}=$shy;
	print ":$bare:\t:$shy:\n" if $verbose>2;
	}
}

# returns a hyphenated string
#   $s - one text node; entities are expected in their source form (&name;)
# Unless --keepshy is set, existing &shy; are removed first. Then every
# word of the text that is a key of %words is replaced by its hyphenated
# form. Words not in the dictionary and longer than $printabsents are
# counted in %absents and their file is remembered in %absentpath.
# With --nohyphenate only the removal of &shy; takes place.
sub hyphenateString
{
my $s=shift;
if (!$keepshy) {
	$s =~ s/&shy;//g;   # remove existing &shy;
	}
# create a second string and eliminate entities to prevent them from hyphenation
my $s2=$s;
$s2=~ s/&#8239;|&#x202f;|&nbsp;|&times;|&lt;|&gt;/ /g;	# replace spacey entities with spaces
# delete the remaining entities; only well-formed ones, so that a bare '&'
# does not swallow the text up to the next ';'
$s2=~ s/&#?\w+;//g;
my @wds=split('\W+',$s2);	# split words
if ($verbose>2) { print "."; }
if ($hyphenate) {
	my %replaced;	# words already substituted in this string
	foreach my $k (@wds) {	# for each word in $s2
		my $hyp=$words{$k};
		if (defined $hyp) {
			next if $replaced{$k}++;	# /g below already handled all occurrences
			# Replace whole words only, but never
			#  - an entity name (preceded by '&'), e.g. the "minus" in &minus;
			#  - a fragment of an already hyphenated word (next to &shy;)
			$s =~ s/(?<!&)(?<!&shy;)\b\Q$k\E\b(?!&shy;)/$hyp/g;
		} elsif ($printabsents && length($k)>$printabsents) {
			# word is not in dictionary: remember it for the report
			$absents{$k}++;
			$absentpath{$k}=$current_file;
			}
		}
	}
return($s);
}

# Prints the words that were not found in the dictionary, sorted, each with
# its number of occurrences and the last file it was seen in.
# If $absentfilename is set, the words are additionally written to that
# file, one per line. The file is written as UTF-8, the encoding in which
# dictionaries are read. Nothing is printed or written if there are no
# absent words.
# Dies if the file cannot be opened or written.
sub printAbsents
{
my @absent_words=sort keys(%absents);
return if !@absent_words;	# no absents: no report and no file
print "\n${col_info}*** words not in dictionary with length >$printabsents ***${col_off}\n";
foreach my $w (@absent_words) {
	print $w, " :", $absents{$w}, " ", $absentpath{$w}, "\n";
	}
return if $absentfilename eq '';
print "write $absentfilename\n";
open (my $fh, '>:utf8', $absentfilename) or die "cannot open absentsfile for write - $!";
foreach my $w (@absent_words) {
	print $fh "$w\n";
	}
# write errors may only be reported when the buffer is flushed
close($fh) or die "cannot write absentsfile - $!";
}


# Prints the usage summary followed by the full manual, then terminates
# the program. The current values of $printabsents, $lang and $tags are
# shown as defaults in the text.
sub help()
{
(my $myname=$0) =~ s#^.*/##;    # program name without its path
usage();
print << "EOF";

  <HTML_file_name>: the name of the HTML file you want to hyphenate.
    The encoding of the file is 'sniffed' by TreeBuilder so any supported
    encoding should work fine.
    The output filename is suffixed with '.hyp.html' unless the --inplace
    or --backup option is used.
  --absents <n>: print words not in dictionary with length > n
    n defaults to $printabsents. Set to 0 to turn it off.
    This is highly useful to find pages with wrong <html lang..>
    or long passages with different language without a <span lang=..> or so!
  --backup: instead of generating a new file, it uses the name of the
    input file. The original file is renamed to *.bak. An already existing
    .bak file will be deleted.
    This option implies --inplace.
  --compress: if you use this option you will basically get all HTML in one
    line. This will save a little(!) space and may be useful for obfuscation.
    Caution: if you delete or overwrite your original source, this can not be
    undone!
  --dict <file>: sets the name of the hyphenation dictionary file.
    The file must contain one word in a line. Possible hyphens can be set by
    a '|' like in 'Tren|nungs|vor|schlag'. This is the same notation the
    german 'Duden' uses so you can copy and paste from there.
    The dictionary has to be in UTF-8 format.
    You may sort the dictionary for easier reading but the order of the words
    does not matter.
    The --dict option may be specified more than once if you have several
    dictionary files.
    The file is read as soon as the option is parsed, i.e. from left to right
    and options that prevent reading dictionaries must be first to become
    effective.
  --discard-comments: remove HTML comments (<!-- ... -->).
    Useful for obfuscation. Do not use if you use Server Side Includes (SSI)
    since they would be removed.
  --hyphenate: this is the default, apply hyphenation. Can be used as
    --nohyphenate to remove existing hyphenation.
  --inplace: replace (overwrite) the input file (dangerous!)
    Use instead of --backup if you plan to delete the backup file anyway.
  --keepshy: normally, existing soft hyphens are removed prior to hyphenation.
    This helps actualizing the document when new words are added to the
    dictionary since words with &shy; will not be touched even if a finer
    hyphenation would be available.
    This option keeps existing &shy; before applying the dictionary.
  --lang: the selected language. This is used to determine which tags
    to hyphenate. No hyphenation will be applied to tags or pages
    with a lang-attribute other than given.
    Pages without specified language (i.e. no <html lang=>) will not be
    hyphenated at all.
    Defaults to "$lang".
    Note: this does not influence the selection of dictionaries but you
    may insert a \$lang anywhere in a dictionary name, e.g. dict.\$lang.txt.
  --links: follow symbolic links.
    normally only regular files are hyphenated. this option changes this
    behavior.
  --quiet: print nothing. This option overrides --verbose and --absents.
  --recursive: descent recursively through directories.
    To prevent shell expansion, enclose wildcard file names in apostrophes.
  --tags "tag ...": Sets the tags hyphenation will be applied to. It defaults
    to "$tags"
    that means most readable text
    except <sub>, <sup> and headings.
    Note: tags not in the list will not be touched; neither any &shy; will be
    inserted nor will existing &shy; be removed.
  --verbose: print what's going on.
    Add more than once (-vvv) to increase verbosity.
  --writeabsents: write a file containing words not in dictionary according
    to the length specified with --absents. If there are no words, the file
    is not written at all.

Environment Variables:
    $myname uses the following environment variables to allow keeping the
    command lines short:
  HYPHENATEHTML_DICT_PATH
    specifies the directory where your dictionaries live. You may use
    \$lang to select between various languages, e.g.
    /usr/local/etc/dict_\$lang
  HYPHENATEHTML_DICT
    contains a space separated list of dictionaries. If you specify the
    --dict option on the command line is given, additional files will
    be loaded before.
  HYPHENATEHTML_TAGS
    space separated list of tags to hyphenate.
    overwrites the default tags if set.

The order in which dictionary files are specified may influence the result
since words in dictionaries loaded later may overwrite words in earlier ones.
Dictionaries in HYPHENATEHTML_DICT are always loaded after the ones specified
with --dict.
Note: this script uses HTML::TreeBuilder for reading and writing HTML.
Although it tries to keep whitespaces, they may slightly change but
this will not impact the appearance in the browser.
We also use strict dictionary based hyphenation (no rules) so words
not in any of the dictionary files will not be hyphenated.

EOF
exit;
}

# Prints the short usage summary (program name and option overview) to
# STDOUT. It does NOT exit: help() prints more text after calling it, so
# callers that want to stop must exit themselves.
sub usage()
{
(my $myname=$0) =~ s#^.*/##;    # program name without its path
print << "EOF";

Hyphenate HTML files.

    usage: $myname [options] <HTML_file_name>

Options:
  -a, --absents <n>        : print words not in dictionary with length > n
  -b, --backup             : keep a backup of the original file
  -c, --compress           : remove ignorable whitespaces
  -d, --dict <file>        : name of the hyphenation dictionary file
  -C, --discard-comments   : remove comments (<!-- -->)
  -h, --help               : this help text
  --hyphenate              : (default) apply hyphenation
                             can be used as --nohyphenate to only remove &shy;
  -I, --inplace            : replace (overwrite) the input file
  -k, --keepshy            : keep existing &shy; in source file
  -L, --lang               : language (like in <html lang="..">)
  -l, --links              : follow symbolic links
  -n, --dryrun             : do not write output file ("do nothing")
  -q, --quiet              : print nothing
  -r, --recursive          : recursive search
  -t, --tags "tag ..."     : space separated list of tags to hyphenate
  -v, --verbose            : print about what's done
  -w, --writeabsents <file>: write words not in dictionary to file
EOF
}
