#!/usr/bin/perl
# Fix up html files after they've been extracted from .lit books or
# .doc files.
# Strip out all the formatting so that what's left is plain-jane html.
#
# Reads the files named on the command line (or STDIN when none are given)
# and writes the cleaned-up text to STDOUT.
#
# NOTE(review): this copy of the script was recovered from a version whose
# tag-like text and HTML entities had been stripped/decoded.  Patterns marked
# "reconstructed" below are inferred from the surviving fragments and should
# be confirmed against a pristine copy of the script.

use strict;
use warnings;

# Tag rules, applied in order to every (possibly joined) input line.
# Each rule is [ trigger, pattern, replacement ]: when the trigger matches,
# replace_pattern() removes/replaces every match of the pattern, pulling in
# further input lines if the tag is not yet complete on the current line.
my @TAG_RULES = (
    # Drop all attributes from paragraph tags.
    # NOTE(review): "<[pP]" also matches other tags starting with "p"
    # (e.g. <pre>); kept as in the original.
    [ qr/<[pP]/,   qr/<[pP][^>]*>/,                  '<p>' ],

    # Remove whole style sheets.  Non-greedy so that text between two
    # separate <style> blocks on the same line is not deleted with them.
    [ qr/<style/,  qr{<style[^>]*>.*?</style[^>]*>}, ''    ],

    # Reconstructed: opening tag name was lost; "span" inferred from the
    # surviving closing-tag rule that follows it -- TODO confirm.
    [ qr/<span/,   qr/<span[^>]*>/,                  ''    ],
    [ qr{</span},  qr{</span[^>]*>},                 ''    ],

    [ qr{</div},   qr{</div[^>]*>},                  ''    ],
    # Reconstructed: opening tag name was lost; "div" inferred from the
    # surviving closing-tag rule that precedes it -- TODO confirm.
    [ qr/<div/,    qr/<div[^>]*>/,                   ''    ],

    # TODO(review): one more rule of the form  "<TAG[^>]*>" -> ""  (triggered
    # by /<TAG/) stood here in the original; its tag name could not be
    # recovered.  Restore it from the original script.
);

# Holds a bare "<p>" line so it can be glued onto the front of the next line.
my $lastline = '';

while (my $line = <>) {
    # chomp, not chop: only strip a trailing newline, never a real character
    # (the last line of a file may lack one).
    chomp $line;
    $line     = $lastline . $line;
    $lastline = '';

    # Debugging aid: report any character in the 128..255 range.
    # my $lo = chr(128);
    # my $hi = chr(255);
    # if ($line =~ /^.*([$lo-$hi]).*$/) {
    #     print STDERR "here: $1, " . ord($1) . ", $line\n";
    # }

    for my $rule (@TAG_RULES) {
        my ($trigger, $strip, $replace) = @{$rule};
        $line = replace_pattern($line, $strip, $replace)
            if $line =~ $trigger;
    }

    # MS Office "<o:p>" / "</o:p>" markers.
    $line =~ s{</?o:p>}{}g;

    # Turn common entities into plain ASCII.  Where the recovered source no
    # longer showed whether the named or the numeric form was used, both are
    # accepted.
    $line =~ s/&quot;/"/g;
    $line =~ s/&#822[012];/"/g;                 # curly / low double quotes
    $line =~ s/&#821[67];/'/g;                  # curly single quotes
    $line =~ s/&(?:mdash|#8212);/ -- /g;
    $line =~ s/&(?:hellip|#8230);/.../g;
    $line =~ s/&#8209;/-/g;                     # non-breaking hyphen
    #$line =~ s/&(?:trade|#8482);/trademark/g;
    #$line =~ s/&(?:reg|#174);/registered trademark/g;
    #$line =~ s/&(?:copy|#169);/copyright/g;
    $line =~ s/&(?:nbsp|#160);/ /g;
    $line =~ s/&(?:shy|#173);//g;               # soft hyphen: drop entirely
    $line =~ s/\r//g;

    # A line that is nothing but "<p>" is held back and prefixed to the next
    # line instead of being printed on its own.
    if ($line =~ /^<[pP]>$/) {
        $lastline = $line;
        next;
    }

    print "$line\n";
}

# Flush a trailing bare "<p>" that never got a following line.
print "$lastline\n" if length $lastline;

exit 0;

# replace_pattern($line, $strip, $replace)
#
# Replace every match of the regex $strip in $line with the string $replace
# and return the resulting line.
#
# If $strip does not match $line yet (for instance a tag whose closing ">"
# sits on a later input line), more lines are read from <> and appended,
# separated by a single space, until the pattern matches or input runs out.
sub replace_pattern {
    my ($line, $strip, $replace) = @_;

    while ($line !~ /$strip/) {
        # Never read past the end of input here: <> returns undef only once,
        # so a read at EOF would make the caller's while (<>) loop start
        # over on STDIN.
        last if eof();

        my $more = <>;
        last if !defined $more;

        chomp $more;
        $line .= " " . $more;
    }

    $line =~ s/$strip/$replace/g;
    return $line;
}