#!perl

# version 20170914

# ---------------------------------
# Pre-processing pass over the chunk of WordprocessingML held in $_ (edited in
# place). It flattens whitespace in front of tags, strips the attributes of
# <w:r> and <w:t>, masks the &lt; &gt; &amp; entities and then, depending on
# the global $level, removes progressively more formatting markup:
#   >= 0  rendered page breaks
#   >= 1  default-looking run properties, language/proofing marks, revision
#         ids, bookmarks, the "hps"/"x<digits>" character styles
#   >= 2  East-Asian / complex-script font attributes, special hyphens, smart tags
#   >= 3  comment markup            >= 4  hyperlink wrappers
#   >= 5  hidden/shadow/colour/highlight     >= 6  fonts and font sizes
#   >= 7  every <w:rPr> that holds no rStyle  >= 8  every <w:rPr>
# Reads the global $level, writes the global $skip and calls _clean() for the
# removals that are restricted to run properties. Takes no arguments.
sub before() {
	# Track whether a text box was opened/closed in this chunk. In the code
	# visible here $skip is only consulted by a commented-out rule further down.
	$skip = 1 if m{<wps:txbx>};
	$skip = 0 if m{</wps:txbx>};

#	if ($level >= 2) {
#		next if m{<w:pStyle w:val="TOC[^"]+"/>};	# suppress TOC
#	}

	# Flatten the file: drop any mix of newlines and tabs directly in front of
	# a tag. Doing this as two passes (\n+ first, \t+ second) leaves the "\n"
	# of a "\n\t<" indent behind, which is the indent after() writes in front
	# of runs in beauty mode, so blank lines piled up on every re-run.
	s{[\n\t]+<}{<}g;

	s{<w:r [^>]*>}{<w:r>}g;	# flat runs
	s{<w:t [^>]*>}{<w:t>}g;	# flat text

	# Mask the three entities as "&lt#", "&gt#", "&amp#"; after() restores them.
	# Presumably this keeps their ";" away from the punctuation rules in
	# transform() - see http://www.liquid-technologies.com/XML/EscapingData.aspx
	s{&(lt|gt|amp);}{&$1#}gm;

#	clean ...
#	http://www.datypic.com/sc/ooxml/e-w_rPr-2.html

	if ($level >= 0) {
		s{<w:lastRenderedPageBreak/>}{}g;
	}
	if ($level >= 1 ) {
		# _clean suppresses only inside the properties of a "run"
		_clean('<w:snapToGrid w:val="0"/>');
		_clean('<w:szCs w:val="24"/>');					# suppress Font Size when 24 (=default?)
		_clean('<w:sz w:val="24"/>');					# suppress Font Size when 24 (=default?)
		_clean('<w:kern w:val="\d+"/>');				# suppress kerning attribute
		_clean('<w:color w:val="000000"/>');				# suppress color when black (or default)
		_clean('<w:color w:val="auto"/>');				# suppress color when "auto"

		_clean('<w:u w:color="000000"/>');				# suppress color when black (or default)
		_clean('<w:u w:color="auto"/>');				# suppress color when "auto"

		_clean('<w:bdr w:val="nil"/>');

		# NOTE(review): ".." matches two-character values only - confirm that
		# three-digit scaling values are meant to survive.
		_clean('<w:w w:val=".."/>');					# suppress compressed or expanded text
#		_clean('<w:rFonts w:eastAsia="[^>]+"/>');			# avoid asian attribute for fonts
		_clean('<w:spacing w:val="[^"]+"/>');				# suppress space spacing

		# drop every remaining element that ends in w:val="nil"/>
		s{<[^<]+?w:val="nil"/>}{}gms;

		s{<w:lang[^>]*?>}{}g;
		s{<w:noProof/>}{}g;
		s{<w:proofErr w:type="\w+?"/>}{}g;			# avoid proofing error definition
		s{ w:rsidR="[^"]+"}{}g;						# remove revision IDs
		s{ w:rsidRPr="[^"]+"}{}g;					#
		s{ w:rsidP="[^"]+"}{}g;						#
		s{></w:t>}{> </w:t>}g;						# add a space into a blank text

		s{<w:rStyle w:val="(hps|x\d+)"/>}{}gm;			# suppress referenced char styles "hps" or "x1" ... "x9999"
		s{<w:bookmark(Start|End)[^<>]+?/>}{}gm;

	}

	if ($level >= 2) {
#		s{<w:rFonts[^<>]+?/>}{}g;	# suppress Font attribute

		# Remove just the one attribute. The value must stop at its own closing
		# quote: a greedy [^>]+ runs on to the last quote inside the tag and
		# silently deletes every attribute that follows (e.g. w:hAnsi).
		s{ w:eastAsia="[^"]*"}{}g;
		s{ w:cs="[^"]*"}{}g;
		s{<w:szCs[^<>]+>}{}g;
		s{<w:bCs/>}{}g;
		s{<w:iCs/>}{}g;

		s{<w:noBreakHyphen/>}{<w:t>-</w:t>}g;		# replace non breaking Hyphen with normal "-"
		s{<w:softHyphen/>}{}g;						# replace Soft Hyphen with nothing
		s{<w:smartTag[^>]+>}{}gm;					# suppress SmartTags
		s{</w:smartTag>}{}gm;
	}

	if ($level >= 3) {
		s{<w:comment[^<>]+>}{}gm;						# suppress Comments
		s{<w:rStyle w:val="CommentReference"/>}{}gm;
	}
	if ($level >= 4) {
		s{<w:hyperlink[^<>]+>}{}gm;						# suppress Hyperlinks (turn them into normal text)
		s{</w:hyperlink>}{}gm;
	}
	if ($level >= 5) {
		_clean('<w:vanish/>');							# suppress hidden text
		_clean('<w:shadow/>');							# suppress shadow text
		_clean('<w:color[^<>]+>');							# suppress every color in text
		_clean('<w:highlight w:val="[^"]+?"/>');				# suppress highlight
	}
	if ($level >= 6) {
		_clean('<w:rFonts[^<>]+?/>');					# avoid attribute for fonts
		_clean('<w:sz[^<>]+?/>');						# suppress Font Size
	}
	if ($level >= 7) {
		s{<w:rPr>(?:(?:(?!w:rStyle).)*?)</w:rPr>}{}g;		# remove every rPr that contains no rStyle
#		s{<w:r>(?:(?:(?!<w:t>).)*?)<w:t>}{<w:r><w:t>}g;
#		s{</w:r>(?:(?:(?!<w:r>).)*?)<w:r>}{</w:r><w:r>}g
	}
	if ($level >= 8) {
		s{<w:rPr>(?:(?:(?!</w:rPr>).)*?)</w:rPr>}{}g;	# remove every attribute
	}

	s{<w:rPr></w:rPr>}{}g;								# suppress empty run properties
	s{<w:r></w:r>}{}gm;								# suppress empty runs

#	normalize ... <w:br/> and <w:tab/> have their own run
#	s{(<w:r>(?:(?!<w:t>).)*)(<w:br/>|<w:tab/>)<w:t>}{$1$2</w:r>$1<w:t>}g if $skip == 0;

	# A tab run followed by the footnoteRef run: keep the tab, drop the
	# footnoteRef run.
	s{<w:r><w:tab/></w:r><w:r><w:rPr><w:rStyle w:val="FootnoteReference"/></w:rPr><w:footnoteRef/></w:r>}
	  {<w:r><w:tab/></w:r>}g;
}
#
# Text-level rewriting of the chunk in $_ (edited in place, no arguments):
# hyphen handling, merging of neighbouring runs that start identically, and
# re-ordering of punctuation around footnote references.
sub transform() {
#	s{</w:t></w:r><w:r><w:noBreakHyphen/></w:r>}{-</w:t></w:r>}g;

	# A non-breaking hyphen right in front of a text element becomes a plain
	# "-" at the start of that text; soft hyphens disappear.
	$_ =~ s{<w:noBreakHyphen/><w:t>}{<w:t>-}g;
	$_ =~ s{<w:softHyphen/>}{}g;

	# Compress tags: when a run is followed (optionally after whitespace) by a
	# run with the very same opening - same <w:rPr> content, then <w:t> - the
	# close/open pair between them is dropped so both texts share one run.
	# One merge per pass; repeat until nothing is left to merge.
	1 while $_ =~ s{(<w:r>(?:<w:rPr>(?:<w:[^>]+>)*</w:rPr>)?<w:t>)([^>]+)(</w:t></w:r>)\s*(\1)}{$1$2};

	# Footnote reference directly after text ending in . ! ? ; or : -
	# close the text before the punctuation, emit the footnote run, then put
	# the punctuation into a new plain run after it.
	$_ =~ s{([.!?;:]\s*(</w:t></w:r>))\s*(<w:r><w:rPr><w:rStyle w:val="FootnoteReference"/>(<[^>]+?>)*</w:rPr><w:footnoteReference w:id="[^"]+?"/></w:r>)}
	       {$2$3<w:r><w:t>$1}g;

	# Footnote reference wrapped in [] {} or (): the single space/punctuation
	# character in front of the opening bracket is moved behind the closing one.
	# NOTE(review): ".+?" needs at least one character between <w:r> and <w:t>
	# in the closing-bracket run, so a run without properties does not match.
	$_ =~ s{([\s;:!?.])([\[\(\{]</w:t></w:r>\s*<w:r><w:rPr><w:rStyle w:val="FootnoteReference"/>(?:<[^>]+?>)*</w:rPr><w:footnoteReference w:id="[^"]+?"/></w:r>\s*<w:r>.+?<w:t[^>]*>[\]\)\}])}
	       {$2$1}g;

#	while ( s{</w:t></w:r><w:r><w:t>([ ,.?!:"]+)</w:t>}{$1<\/w:t>}mx ) {};			# attach ([ ,.?!:"] to preceding run

}
# ---------------------------------
# Final pass over the chunk in $_ (edited in place, no arguments): marks every
# bare <w:t> as space-preserving, starts <w:document on its own line and turns
# the masked "&lt#" / "&gt#" / "&amp#" back into real entities. When the global
# $beauty is true, line breaks are added around paragraphs, runs, the body,
# the closing document tag and table cells/rows.
sub after() {

	$_ =~ s{<w:t>}{<w:t xml:space="preserve">}g;
	$_ =~ s{(<w:document)}{\n$1}g;

	# unprotect, see http://www.liquid-technologies.com/XML/EscapingData.aspx
	$_ =~ s{&(lt|gt|amp)#}{&$1;}g;

	if ($beauty) {
		# [ tag pattern, text inserted in front of each match ], applied in order
		my @lead_in = (
			[ qr{(<w:p |<w:p>|</w:p>)},   "\n"   ],
			[ qr{(<w:r>|<w:r w[^<>]+?>)}, "\n\t" ],
			[ qr{(<w:body>)},             "\n"   ],
			[ qr{(<.w:docu[^<>]+?>)},     "\n"   ],
			[ qr{(<.w:body>)},            "\n"   ],
		);
		for my $rule (@lead_in) {
			my ($tag, $prefix) = @$rule;
			$_ =~ s{$tag}{$prefix$1}g;
		}

		# table cells and rows get breaks on both sides
		$_ =~ s{</w:tc><w:tc>}{\n</w:tc>\n<w:tc>}g;
		$_ =~ s{(</w:tc></w:tr>)}{\n$1\n}g;
		$_ =~ s{(<w:tbl>)}{\n$1}g;
	}
}

# Removes one element from the run properties of each run in $_ (edited in
# place).
#
#   $_[0]  regex source matching the element to remove, e.g. '<w:vanish/>' or
#          '<w:kern w:val="\d+"/>'. It is interpolated as a pattern, so it
#          must not contain capturing groups (they would shift $2 below).
#
# Only text between "<w:r><w:rPr>" and the first "</w:rPr>" after it is
# considered, and at most the first occurrence per run is removed.
# Returns the number of substitutions (nothing for a missing/empty pattern).
#
# No prototype here: the sub takes an argument, and an empty "()" prototype
# only went unnoticed because every caller was compiled before this definition.
sub _clean {
	my ($pat) = @_;
	return unless defined $pat && length $pat;

	# "(?:(?!</w:rPr>).)*?" keeps both lazy stretches inside one <w:rPr>.
	# A plain ".*?" can start in one run and end at a later "</w:rPr>", which
	# removes matches outside run properties and can swallow a following run
	# so that its own match is never cleaned.
	s{(<w:r><w:rPr>(?:(?!</w:rPr>).)*?)$pat((?:(?!</w:rPr>).)*?</w:rPr>)}{$1$2}g;
}

##########################################

# Driver: rewrites the file named on the command line in place.
#   BEAUTY (environment)  true value => after() adds line breaks
#   LEVEL  (environment)  clean-up level compared numerically in before()
# $beauty and $level stay package globals because the subs above read them.
$beauty = $ENV{BEAUTY};
$level = $ENV{LEVEL};
$file = $ARGV[0];

die "usage: $0 <file>\n" unless defined $file && length $file;

$/ = q{</w:p>};	# chunks are paragraphs

# Every I/O step is checked: the original is only removed after the
# replacement has been written and closed successfully.
my $tmp = "$file.tmp";
open(my $in, '<:encoding(utf-8)', $file) or die "cannot read '$file': $!\n";
open(my $out, '>:encoding(utf-8)', $tmp) or die "cannot write '$tmp': $!\n";
while (<$in>) {
	before();
	transform();
	after();
	print {$out} $_ or die "write to '$tmp' failed: $!\n";
}
close $in;
close $out or die "closing '$tmp' failed: $!\n";	# buffered write errors show up here

unlink $file or die "cannot remove '$file': $!\n";
rename $tmp => $file or die "cannot rename '$tmp' to '$file': $!\n";
