'''
Copyright 2009 Milan Tofiloski, Julian Brooke
This file is part of SLSeg.

    SLSeg is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SLSeg is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SLSeg.  If not, see <http://www.gnu.org/licenses/>.
'''

import sys

'''
usage: python prep_as_input_to_charniak_parser.py infile outfile
'''

PARAGRAPH_MARKER = "<PARAGRAPH>"
PARAGRAPH_LINE = "<s> <PARAGRAPH> </s>\n"


def tag_sentences(text):
    """Return the lines to write for *text*, one ``<s> ... </s>`` entry each.

    The text is first re-split so that a sentence ending in "!" or "?" that
    is directly followed by " I " starts a new line. Every resulting line is
    then wrapped in ``<s> `` / `` </s>`` and terminated with a newline.

    A line containing ``<PARAGRAPH>`` markers is cut at each marker: every
    non-empty piece becomes its own tagged sentence and every marker becomes
    a stand-alone ``<s> <PARAGRAPH> </s>`` entry.

    Args:
        text: the full contents of the input file.

    Returns:
        A list of newline-terminated strings, in output order.
    """
    # Break apart sentences that were left joined before a following "I".
    text = text.replace("! I ", "!\nI ").replace("? I ", "?\nI ")

    tagged = []
    for line in text.split("\n"):
        # NOTE: empty lines (including the one produced by a trailing
        # newline) are emitted as "<s>  </s>"; this is kept unchanged so the
        # output stays identical to what the script has always produced.
        if PARAGRAPH_MARKER not in line:
            tagged.append("<s> " + line + " </s>\n")
            continue

        # Take care of embedded <PARAGRAPH> markers left inside a line.
        pieces = line.split(PARAGRAPH_MARKER)
        last_index = len(pieces) - 1
        for index, piece in enumerate(pieces):
            if piece:
                tagged.append("<s> " + piece + " </s>\n")
            # One marker sits between each pair of neighbouring pieces.
            if index < last_index:
                tagged.append(PARAGRAPH_LINE)
    return tagged


def main(argv=None):
    """Read ``argv[1]``, tag its sentences and write them to ``argv[2]``.

    Args:
        argv: argument list in ``sys.argv`` layout (program name first);
            defaults to ``sys.argv``.

    Raises:
        SystemExit: with a usage message when fewer than two file names
            are supplied.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit(
            "usage: python prep_as_input_to_charniak_parser.py infile outfile"
        )

    # Context managers guarantee both files are closed even if reading or
    # writing fails part-way through.
    with open(argv[1], "r") as finput:
        text = finput.read()
    with open(argv[2], "w") as foutput:
        foutput.writelines(tag_sentences(text))


if __name__ == "__main__":
    main(sys.argv)

