#!perl

# version 20200116

# ---------------------------------
# Clean up the chunk of WordprocessingML currently held in $_ (modified in
# place).  How much markup is stripped depends on the global $level; every
# level also applies all lower ones:
#   >= 0  rendered page-break markers
#   >= 1  proofing/language noise, complex-script variants, default colors
#   >= 2  kerning, character scaling, spacing, every color
#   >= 3  hyperlinks, bookmarks, comments, smart tags
#   >= 4  strike/dstrike/vanish/shadow, highlight, font size 24
#   >= 5  every font and font-size definition
#   >= 6  run properties that do not mention w:rStyle
#   >= 7  run properties that directly follow <w:r>
#   >= 8  every remaining <w:rPr>...</w:rPr>
# The entities &lt; &gt; &amp; are rewritten to &lt# &gt# &amp# here so the
# later patterns cannot trip over the ';'; after() turns them back.
sub before() {
	# Remember whether we are inside a text box.  The flag is only recorded
	# in the global $skip; nothing in this sub reads it.
	$skip = 1 if m{<wps:txbx>};
	$skip = 0 if m{</wps:txbx>};

#	if ($level >= 2) {
#		next if m{<w:pStyle w:val="TOC[^"]+"/>};	# suppress TOC
#	}

# 	Correct malformed XML file
	s{>\s*<(?!/w:t>)}{><}gm;	# drop whitespace between tags, but not in front of </w:t>
	s{\s/>}{/>}gm;

	s{<w:r [^>]*>}{<w:r>}gm;	# flat runs
	s{<w:t [^>]*>}{<w:t>}gm;	# flat text

	s{&(lt|gt|amp);}{&$1#}gm;	# protect http://www.liquid-technologies.com/XML/EscapingData.aspx

#	clean ...

	if ($level >= 0) {
		s{<w:lastRenderedPageBreak/>}{}g;
	}
	if ($level >= 1 ) {
		s{<w:snapToGrid\sw:val="0"/>}{}gm;
		s{<w:szCs[^>]+>}{}gm;				# suppress complex script size
 		s{<w:color\sw:val="000000"/>}{}gm;
		s{<w:color\sw:val="auto"/>}{}gm;
		s{<w:lang[^>]+>}{}gm;
		s{<w:noProof/>}{}g;
		s{<w:proofErr[^>]+>}{}gm;					# avoid proofing error definition
		s{<w:rFonts\sw:eastAsia="[^"]+"/>}{}gm;

		# use the ascii font name for hAnsi and cs as well
		s{<w:rFonts\sw:ascii="([^"]+)"\sw:hAnsi="[^"]+"\sw:cs="[^"]+">}{<w:rFonts w:ascii="$1" w:hAnsi="$1" w:cs="$1">}gm;

		s{<w:.Cs/>}{}gm;					# suppress Bold and Italic complex script

		s{(<w:r( w:rsidR[^"]*"[^"]+")+>|<w:r>)}{<w:r>}gm;	# remove revision IDs

		s{<w:rStyle w:val="(hps|x\d+)"/>}{}gm;	# suppress referenced char styles "hps" or "x1" ... "x9999"

	}

	if ($level >= 2) {

		# use the w:val language for eastAsia and bidi as well
		s{w:lang\sw:val="([^"]+)"\sw:eastAsia="[^"]+"\sw:bidi="[^"]+"}{w:lang w:val="$1" w:eastAsia="$1" w:bidi="$1"}g;

		s{<w:kern[^>]+/>}{}gm;
		s{<w:w\sw:val=".."/>}{}gm;
		s{<w:spacing[^>]+>}{}gm;
		s{<w:color[^>]+>}{}gm;					# avoid every color attribute
	}

	if ($level >= 3) {
		s{</?w:hyperlink[^>]*>}{}gm;			# suppress Hyperlinks (transform them to normal text)
		s{<w:bookmark[^>]+>}{}gm;				# suppress Bookmarks
		s{<w:comment[^>]+>}{}gm;					# suppress Comments
		s{<w:rStyle w:val="CommentReference"/>}{}gm;
		s{</?w:smartTag[^>]*>}{}gm;					# suppress SmartTags
 	}

	if ($level >= 4) {
		s{<w:strike/>}{}gm;					# suppress struck-through text marker
		s{<w:dstrike/>}{}gm;				# suppress double strike-through marker
		s{<w:vanish/>}{}gm;					# suppress hidden text marker
		s{<w:shadow/>}{}gm;					# suppress shadow text marker
		s{<w:color[^>]+>}{}gm;				# suppress every color in text
		s{<w:highlight[^>]+>}{}gm;			# suppress highlight
		s{<w:szCs w:val="24"/>}{}gm;		# suppress Font Size when 24 (=default?)
		# This pattern used to start with "<<", which never occurs in the
		# markup, so size 24 was silently kept.
		s{<w:sz w:val="24"/>}{}gm;			# suppress Font Size when 24 (=default?)
	}

	if ($level >= 5) {
		s{<w:rFonts[^>]+?/>}{}gm;				# avoid attributes for fonts
		s{<w:sz[^>]+?/>}{}gm;					# suppress Font Size
	}

	if ($level >= 6) {
		s{<w:rPr>(?:(?:(?!w:rStyle).)*?)</w:rPr>}{}g;		# remove every rPr that does not mention rStyle
	}

	if ($level >= 7) {
		s{<w:r><w:rPr>(?:(?!</w:rPr>).)*</w:rPr>}{<w:r>}gm;		# remove every attribute inside run
	}

	if ($level >= 8) {
		s{<w:rPr>(?:(?!</w:rPr>).)*</w:rPr>}{}gm;		# remove every attribute inside paragraph
	}

	s{>\s*</w:t>}{> </w:t>}gm;								# add a space into a blank text


#	normalize ... <w:br/> and <w:tab/> have their own run
	s{(<w:r[^>]*>((?:(?!</?w:[tr]>).)*))(<w:tab/>|<w:br/>)((?:(?!<w:[tr]>).)*)}{<w:r>$3</w:r>$1$4}gm;

#	put a space before <w:br/> (soft return)
	s{(<w:r><w:br/></w:r>)}{<w:r><w:t> </w:t></w:r>$1}g;
	s{\s*(</w:t></w:r>\s*<w:r><w:br/></w:r>)}{ $1}gm;

	s{<w:rPr></w:rPr>}{}mg;					# suppress empty run properties
	s{<w:r></w:r>}{}gm;						# suppress empty runs

	s{<w:aliases[^>]+/>}{}gm;		# suppress style aliases

#	s{<w:r><w:tab/></w:r><w:r><w:rPr><w:rStyle w:val="FootnoteReference"/></w:rPr><w:footnoteRef/></w:r>}
#	  {<w:r><w:tab/></w:r>}g;
}
#
# Rewrite the run structure of the chunk held in $_ (modified in place):
# hyphen markers are resolved, neighbouring runs with identical opening
# markup are merged, and footnote references are moved in front of the
# punctuation they follow.
sub transform() {

	# A non-breaking hyphen becomes a plain "-" at the start of the text
	# that follows it; a soft hyphen simply disappears.
	$_ =~ s{<w:noBreakHyphen/><w:t>}{<w:t>-}g;
	$_ =~ s{<w:softHyphen/>}{}g;

	# Compress tags: when a run is immediately followed by another run that
	# opens with exactly the same markup (\1), cut out the seam between
	# them.  One seam is removed per pass; repeat until nothing matches.
	1 while $_ =~ s{(<w:r>(?:<w:rPr>(?:<w:[^>]+>)*</w:rPr>)?<w:t>)([^>]+)(</w:t></w:r>)\s*(\1)}{$1$2};

	# A footnote reference run that follows text ending in one of .!?;:
	# is moved in front of that punctuation, which gets a run of its own.
	$_ =~ s{([.!?;:]\s*(</w:t></w:r>))\s*(<w:r><w:rPr><w:rStyle w:val="FootnoteReference"/>(<[^>]+?>)*</w:rPr><w:footnoteReference w:id="[^"]+?"/></w:r>)}
	  {$2$3<w:r><w:t>$1}g;

	# Same idea for a footnote reference wrapped in [] {} or (): the
	# whitespace/punctuation character in front of the opening bracket is
	# moved behind the closing one.
	$_ =~ s{([\s;:!?.])([\[\(\{]</w:t></w:r>\s*<w:r><w:rPr><w:rStyle w:val="FootnoteReference"/>(?:<[^>]+?>)*</w:rPr><w:footnoteReference w:id="[^"]+?"/></w:r>\s*<w:r>.+?<w:t[^>]*>[\]\)\}])}
	  {$2$1}g;

#	disabled: attach [ ,.?!:"] to the preceding run
#	while ( s{</w:t></w:r><w:r><w:t>([ ,.?!:"]+)</w:t>}{$1<\/w:t>}mx ) {};

}
# ---------------------------------
# Final pass over the chunk held in $_ (modified in place): mark every text
# element as whitespace-preserving, restore the entities that before()
# protected, and - when the global $beauty is true - insert line breaks in
# front of the main structural tags to make the XML readable.
sub after() {

	$_ =~ s{<w:t>}{<w:t xml:space="preserve">}gm;
	$_ =~ s{(<w:document)}{\n$1}g;

	# unprotect http://www.liquid-technologies.com/XML/EscapingData.aspx
	$_ =~ s{&(lt|gt|amp)#}{&$1;}g;

	# Everything below is optional pretty-printing.
	return unless $beauty;

	$_ =~ s{(<w:p |<w:p>|</w:p>)}{\n$1}g;			# paragraphs
	$_ =~ s{(<w:r>|<w:r w[^>]+>)}{\n\t$1}g;			# runs, indented by one tab
	$_ =~ s{(<w:body>)}{\n$1}g;						# body start
	$_ =~ s{(<.w:docu[^>]+>)}{\n$1}g;				# document end tag
	$_ =~ s{(<.w:body>)}{\n$1}g;					# body end tag
	$_ =~ s{(</w:tc>)(<w:tc>)}{\n$1\n$2}g;			# boundary between table cells
	$_ =~ s{(</w:tc></w:tr>)}{\n$1\n}g;				# end of a table row
	$_ =~ s{(<w:tbl>)}{\n$1}g;						# table start
}

##########################################

# Driver: rewrite the file named on the command line in place.
# BEAUTY and LEVEL are taken from the environment and are read by
# before()/after() through the package globals $beauty and $level.
$beauty = $ENV{BEAUTY};
$level  = defined $ENV{LEVEL} ? $ENV{LEVEL} : 0;	# unset LEVEL behaves like level 0
$file   = $ARGV[0];

die "usage: $0 FILE\n" unless defined $file && length $file;

$/ = q{</w:p>};	# chunks are paragraphs
$/ = q{</w:style>} if $file =~ m{styles\.xml}i; # in styles.xml the chunks are styles

my $tmp = "$file.tmp";

open(my $in,  '<:encoding(utf-8)', $file) or die "cannot read '$file': $!\n";
open(my $out, '>:encoding(utf-8)', $tmp)  or die "cannot write '$tmp': $!\n";
while (<$in>) {
	before();
	transform();
	after();
	print {$out} $_ or die "write to '$tmp' failed: $!\n";
}
close $in;
# Buffered write errors only show up at close; check before touching the
# original so a failed write never costs us the source document.
close $out or die "closing '$tmp' failed: $!\n";

# Replace the original only once the temporary copy is complete.
unlink($file)       or die "cannot remove '$file' (result left in '$tmp'): $!\n";
rename($tmp, $file) or die "cannot rename '$tmp' to '$file': $!\n";
