'''
Copyright 2009 Milan Tofiloski, Julian Brooke
This file is part of SLSeg.

    SLSeg is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SLSeg is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SLSeg.  If not, see <http://www.gnu.org/licenses/>.
'''

# uses nltk 0.9.5
import nltk, sys

# leaf labels ("<rule name>/HH") that count as ordinary break markers: BREAKrule1..18 followed by surfaceRule1..3
# (the 4a/4b markers are deliberately not generated here; the original literal did not contain them either)
break_rule_list = ["BREAKrule%d/HH" % _rule_number for _rule_number in range(1, 19)] + ["surfaceRule%d/HH" % _rule_number for _rule_number in range(1, 4)]

'''
this is to handle expressions that are of type "so much so that", "to the extent"

surface_break_list description of data structure:
1st element is the key word which we match in the sentence before continuing with the other words (should be the rarest word so as to not keep calling the function's inner loops), e.g. "extent" for "to the extent that"; index = N in test_surface_rule()
2nd element is anything that MUST appear directly before the keyword (can take the form of words or tags) i.e. [["so"], ["AUX"], ["that/IN"]], list of lists containing possibilities for words/tags at N-1, N-2 etc.
3rd element is what MUST directly appear after our keyword (again could be either words or tags), list of lists (representing possibilities) for N+1, N+2, etc.
4th element is any word/tag that cannot appear immediately before required words/tags; 5th element is any word that can't appear after
6th element indicates the location of the inserted break: 0 is the position before the key word, 1 is the position after the key word, -1 is two positions before the key word, etc.
''' 

def get_phrasal_discourse_cues(filename):
	'''
	reads the phrasal discourse cues from a text file, one cue per line
	
	blank lines and lines whose first character is '#' are skipped
	the words of a cue are separated by "_", the key word is the word starting with '@' (the last such word wins; if there is none the first word is used and its first character is dropped), and alternatives for a single position are separated by '|'
	
	@param filename: path of the cue file
	@return: list of cues of the form [key, before, after], where before holds the alternatives for the words at N-1, N-2, ... (nearest first) and after holds the alternatives for N+1, N+2, ...
	'''
	discourse_cues = []
	cuefile = open(filename)
	# try/finally so the handle is released even if a line cannot be parsed
	try:
		for line in cuefile:
			if not line.strip() or line[0] == '#':
				continue
			words = line.strip().split("_")
			index = 0
			for i in range(len(words)):
				if words[i].startswith('@'):
					index = i
			# words before the key are stored nearest-first, i.e. N-1, N-2, ...
			before = words[:index]
			before.reverse()
			after = words[index+1:]
			discourse_cues.append([
				words[index][1:],
				[word.split('|') for word in before],
				[word.split('|') for word in after],
			])
	finally:
		cuefile.close()
	return discourse_cues

def get_surface_break_rules(filename):
	'''
	reads the surface break rules from a text file, one rule per line
	
	blank lines and lines whose first character is '#' are skipped
	the words of a rule are separated by "_" and alternatives for one position by '|'
	the key word is written as "<relative break index>@<key>" (the last word containing '@' wins)
	a first word starting with '^' lists what must NOT appear before the required words, a last word starting with '^' lists what must NOT appear after them
	
	@param filename: path of the rule file
	@return: list of rules of the form [key, before, after, not_before, not_after, relative_insertion_index], the layout unpacked by test_surface_rule(); before is stored nearest-first (N-1, N-2, ...)
	@raise ValueError/IndexError: if a rule line has no well-formed "<int>@<key>" word
	'''
	surface_rules = []
	cuefile = open(filename)
	# try/finally so the handle is released even if a line cannot be parsed
	try:
		for line in cuefile:
			if not line.strip() or line[0] == '#':
				continue
			words = line.strip().split("_")
			index = 0
			for i in range(len(words)):
				if '@' in words[i]:
					index = i
			if words[0].startswith('^'):
				not_before = words[0][1:].split('|')
				before = words[1:index]
			else:
				not_before = []
				before = words[:index]
			if words[-1].startswith('^'):
				not_after = words[-1][1:].split('|')
				after = words[index+1:-1]
			else:
				not_after = []
				after = words[index+1:]
			
			# words before the key are stored nearest-first, i.e. N-1, N-2, ...
			before.reverse()
			(insertion_index, key) = words[index].split('@')[:2]
			surface_rules.append([
				key,
				[word.split('|') for word in before],
				[word.split('|') for word in after],
				not_before,
				not_after,
				int(insertion_index),
			])
	finally:
		cuefile.close()
	return surface_rules

# load both rule tables once, at import time; the file names are relative, so they are looked up in the current working directory
phrasal_discourse_cues = get_phrasal_discourse_cues("phrasaldiscoursecues.txt")
surface_break_list = get_surface_break_rules("surfacebreakrules.txt")

#surface_break_list = [["instead", [], [["of"], ["VBG"]], [], [], 0], ["much", [["so"]], [["so"], ["that"]], [], [], -1], ["although", [], [], [], [], 0], ["though", [["even"]], [],[],[], -1], ["then",[[","]],[],[],[],0], [["then"], [["so", "and", "but"]],[],[],[], -1], ["because", [["RB"]],[],["AUX", "AUXG"],["of"],-1], ["because", [],[],["AUX", "AUXG", "RB"],["of"],0], ["by", [], [["RB"],["VBG"]],[],[], 0], ["by", [], [["VBG"]],[],[], 0], ["so", [], [["that/IN"]],[],[],0], ["so", [[","]],[],[],["JJ"],0], ["order",[["in"]],[["to","that"]],[],[],-1], ["long", [["as"]], [["as"]], ["AUX", "AUXG"], [], -1], ["provided", [], [["that"]], [], [], 0], ["soon", [["as"]], [["as"]], ["AUX", "AUXG"], [], -1], ["time", [["the"], ["by"]], [], ["AUX", "AUXG"], [], -2], ["considering", [], [["that"]], ["AUX", "AUXG"], [], 0], ["despite", [], [["the"], ["fact"] , ["that"]], [], [], 0], ["even", [], [["if"]], [], [], 0], ["except", [], [["when"]], [], [], 0], ["hope", ["the", "in"], [["that"]], [], [], -2], ["insofar", [], [["as"]], [], [], 0], ["condition", [["the"], ["on"]], [["that"]], [], [], -2], ["grounds", [["the"], ["on"]], [["that"]], [], [], -2], ["only", [], [["when"]], [], [], 0], ["extent", [["the"], ["to"]], [["that"]], [], [], -2], ["whether", [], [["or"], ["not"]], [], [], 0], ["given", [], [["that"]], ["AUX", "AUXG"], [], 0]]

#phrasal_discourse_cues = [["he", [], [["said"]]], ["she", [], [["said"]]], ["we", [], [["said"]]], ["they", [], [["said"]]], ["i", [], [["said"]]], ["you", [], [["said"]]], ["thanks", [], []], ["thank", [], [["you"]]], ["thank", [], [["you"], ["very"], ["much"]]], ["thank", [], [["goodness"]]], ["thank", [], [["god"]]], ["all", [], [["things"], ["considered"]]], ["admit", [["to"], ["have"], ["i"]], []], ["turns", [["it"],["as"]], [["out"]]], ["turns", [], ["out"]], ["ask", [["you"], ["if"]], [["me"]]], ["one", [], [["thing"], ["'s", "is"], ["for"], ["sure", "certain"]]], ["guess", [["i"]], []], ["as", [], [["it"], ["is", "stands", "looks"]]], ["know", [["you"]], []], ["think", [["i"]], []], ["wonder", [["i"]], []], ["bet", [["i"]], []], ["mean", [["i"]], []], ["believe", [["i"]], []], ["that", [], [["is"]]], ["make", [], [["no"], ["mistake"]]], ["mind", [], [["you"]]], ["admit", [["i"], ["'ll", "will"]], []], ["will", [["you"], ["if"]], []], ["see", [["you"]], []], ["having", [], [["said"], ["that"]]], ["said", [["that"]], []], ["being", [["that"]], [["said"]]], ["think", [["to"], ["come"]], [["of"], ["it", "that"]]], ["doing", [["in"]], [["so"]]], ["more", [["'s", "is"], ["what"]], []], ["believe", [], [["it"], ["or"], ["not"]]], ["said", [["i"], ["as"]], []], ["say", [["to"], ["is"], ["that"]], []], ["be", [["to"]], [["exact", "precise"]]], ["example", [["an"], ["take"], ["to"]], []], ["point", [["my"], ["make", "further"], ["to"]], []], ["conclude", [["to"]], []], ["to", [], [["begin", "start"], ["with"]]], ["clear", [["be"], ["to"]], []], ["matters", [["make"], ["to"]], [["worse"]]], ["reminds", [["which"]], [["me"]]], ["speak", [["to"], ["so"]], []], ["fact", [["the"]], [["is"]]], ["another", [["it"], ["put"], ["to"]], [["way"]]], ["failing", [], [["that"]]], ["luck", [["as"]], [["would"], ["have"], ["it"]]], ["happened", [["it"], ["as"]], []], ["as", [], [["it"], ["is"]]], ["see", [["I"], ["as"]], [["it"]]], ["speaking", [], [["of"], ["which"]]], ["thing", 
[["the"]], [["is"]]], ["fact", [["the"]], [["is"]]], ["believe", [], [["me"]]], ["were", [["it"], ["as"]], []], ["suffice", [], [["it"], ["to"], ["say"]]], ["that", [], [["'s", "is"], ["why", "how"]]], ["come", [], [["on"]]], ["mean", [["i"]], []], ["barring", [], [["that"]]], ["suppose", [["i"]], []],  ["problem", [["only"], ["the"]], [["is"]]], ["heard", [["'ve", "have"], ["i"]], []], ["hear", [["i"]], []], ["say", [["they"], ["so"]], []]]

# cues keyed on the placeholder "attrib-verb"; each entry has three elements which appear to follow the same
# [key, alternatives before the key, alternatives after the key] layout as the phrasal discourse cues -- TODO confirm against the code that consumes this table
# NOTE(review): "NPP" is not a tag used anywhere else in this file; it may be a typo for "NNP"/"NNPS" -- kept as-is, please verify
attribution_cues = [
	["attrib-verb", [["NNP","NN","PRP","NPP","NNS"]], []],
	["attrib-verb", [], [["NNP","NN","NNS","PRP","NNPS"], ["NNP","NN","NNS","NNPS"]]],
	["attrib-verb", [["NNP","NN","PRP","NPP","NNS"], ["NNP","NN","NNS","NNPS"]], []],
	["attrib-verb", [], [["NNP","NN","NNS","PRP","NPP"]]],
	["attrib-verb", [["NN","NNS"], ["PRP$","DT"]], []],
	["attrib-verb", [], [["PRP$","DT"], ["NN","NNP","NNS"]]],
	["attrib-verb", [["NN","NNS"], ["JJ","NN","NNS"], ["PRP$","DT"]], []],
	["attrib-verb", [], [["PRP$","DT"], ["JJ","NN","NNS"], ["NN","NNP","NNS"]]],
]

# get verbs restricting our Sbar complement rule from the toverblist.txt and ifverblist.txt files (one entry per line)
# note: split("\n") keeps an empty string as last entry when a file ends with a newline
verbfile = open("toverblist.txt", "r")
sbar_complement_toverblist_for_rule3 = verbfile.read().split("\n")
verbfile.close()
verbfile = open("ifverblist.txt", "r")
sbar_complement_ifverblist_for_rule3 = verbfile.read().split("\n")
verbfile.close()

#list of attribution verbs
attrverbfile = open("attributionverbs.txt", "r")
list_of_attr_verbs = attrverbfile.read().split("\n")
# close the handle like the other word-list files in this section (it used to stay open)
attrverbfile.close()
# one-element list used as a mutable flag: find_attr_verbs() sets element 0 to True
attr_in_sentence = [False]

# get prepositions for restricting our Sbar rules from badpreplist.txt
prepfile = open("badpreplist.txt", "r")
sbar_complement_preplist_for_rule3 = prepfile.read().split("\n")
prepfile.close()


def _split_word_and_tag(token):
	'''
	splits a "word/TAG" token into (word, tag) on its LAST slash, so that a word which itself contains a slash ("1/2/CD") keeps its real tag
	a token without any slash (e.g. "<PARAGRAPH>") is returned as (token, "")
	'''
	pieces = token.rsplit("/", 1)
	if len(pieces) == 2:
		return (pieces[0], pieces[1])
	return (token, "")


def _token_matches_any(possibilities, token):
	'''
	returns True if the "word/TAG" token satisfies at least one of the possibilities
	a possibility containing "/" must equal the whole token, an all-uppercase possibility must equal the tag, anything else must equal the word (case-sensitive)
	'''
	(word, tag) = _split_word_and_tag(token)
	for possibility in possibilities:
		# check if we have a word with its tag
		if "/" in possibility:
			if possibility == token:
				return True
		# check if we have just a tag
		elif possibility.isupper():
			if possibility == tag:
				return True
		# check if we have just a word
		elif possibility == word:
			return True
	return False


def test_surface_rule(rule, current_index, list_of_words_and_tags):
	'''
	test_surface_rule() checks to see if the word at current_index satisfies the given surface break rule, which involves 5 comparisons: 
	
	1st, see if the current word (lowercased) is the key word of our restricted phrase i.e. if our word matches "extent" in the phrase "to the extent of"
	2nd, for each thing that must appear before the key word (if any), see if one of the possibilities is the word and/or tag at the relevant index i.e. check that both "to" and "the" appear before "extent" in our text
	3rd, do the same thing as second except for each thing that must appear after (if any) i.e. check that "of" appears after "extent" in our text
	4th, make sure the word (lowercased) or tag before the first required word is not a disallowed word/tag
	5th, same as 4th except we are looking at the word/tag right after the last required word
	
	tokens are split on their last "/" only, so words containing a slash and tokens without a tag no longer raise
	
	@param rule: [key, stuff_before, stuff_after, not_before, not_after, relative_insertion_index]
	@param current_index: index of the candidate key word in list_of_words_and_tags
	@param list_of_words_and_tags: list of "word/TAG" strings
	@return: the index relative to current_index where a breakrule should be inserted if all criteria are satisfied, otherwise -9999
	'''
	(key, stuff_before, stuff_after, not_before, not_after, relative_insertion_index) = rule
	if key != _split_word_and_tag(list_of_words_and_tags[current_index])[0].lower():
		return -9999
	
	# not enough words before the keyword to hold everything the rule requires
	if current_index < len(stuff_before):
		return -9999
	# stuff_before[0] describes N-1, stuff_before[1] describes N-2, etc.
	for (offset, possibilities) in enumerate(stuff_before):
		if not _token_matches_any(possibilities, list_of_words_and_tags[current_index - offset - 1]):
			return -9999
	
	# the words required after the keyword would take us past the end of the sentence
	if current_index + len(stuff_after) >= len(list_of_words_and_tags):
		return -9999
	# stuff_after[0] describes N+1, stuff_after[1] describes N+2, etc.
	for (offset, possibilities) in enumerate(stuff_after):
		if not _token_matches_any(possibilities, list_of_words_and_tags[current_index + offset + 1]):
			return -9999
	
	# the element just before the required words must not be a disallowed word/tag
	index_before_phrase = current_index - len(stuff_before) - 1
	if index_before_phrase >= 0:
		(word, tag) = _split_word_and_tag(list_of_words_and_tags[index_before_phrase])
		if word.lower() in not_before or tag in not_before:
			return -9999
	
	# the element just after the required words must not be a disallowed word/tag
	index_after_phrase = current_index + len(stuff_after) + 1
	if index_after_phrase < len(list_of_words_and_tags):
		(word, tag) = _split_word_and_tag(list_of_words_and_tags[index_after_phrase])
		if word.lower() in not_after or tag in not_after:
			return -9999
	
	return relative_insertion_index


# the name starts with "test_", so tell pytest/nose-style collectors that this is not a test case
test_surface_rule.__test__ = False


def stem_VB(VB, type):
	'''
	stems the verb given its POS tag, trying progressively shorter candidates until one is found in the combined verb lists (to-verbs, if-verbs and attribution verbs)
	
	the verb is always lowercased; it is returned without further stemming when type is "" or "P", when it is shorter than 4 characters, when it is already in the verb lists, or when type is not one of D/N/G/Z (this last case used to return None)
	
	@return: the stemmed verb
	@param VB: is the verb
	@param type: is the 3rd element of its tag, i.e. the tag with its first two characters removed ("D" for VBD, "G" for VBG, ...)
	'''
	VB = VB.lower()
	sbar_complement_verblist_for_rule3 = sbar_complement_toverblist_for_rule3 + sbar_complement_ifverblist_for_rule3 + list_of_attr_verbs
	if type == "" or type == "P" or len(VB) < 4 or VB in sbar_complement_verblist_for_rule3:
		return VB
	elif type == "D" or type == "N":
		if VB[-1] == "d":
			VB = VB[:-1]   #  loved -> love 
			if not VB in sbar_complement_verblist_for_rule3:
				if VB[-1] == "e":
					VB = VB[:-1]   # enjoyed -> enjoy
				if not VB in sbar_complement_verblist_for_rule3:
					if VB[-1] == "i":
						VB = VB[:-1] + "y" # tried -> try
					elif len(VB) > 1 and VB[-1] == VB[-2]:
						VB = VB[:-1]   # compelled -> compel
		return VB
	elif type == "G":
		VB = VB[:-3] # obeying -> obey
		if not VB in sbar_complement_verblist_for_rule3:
			if len(VB) > 1 and VB[-1] == VB[-2]:
				VB = VB[:-1] # stopping -> stop
			else:
				VB = VB + "e" # amusing -> amuse
		return VB
	elif type == "Z" and len(VB) > 3:
		if VB[-1] == "s":
			VB = VB[:-1]  # likes -> like
			if VB not in sbar_complement_verblist_for_rule3 and VB[-1] == "e":
				VB = VB[:-1]  # watches -> watch
				if VB not in sbar_complement_verblist_for_rule3 and VB[-1] == "i":
					VB = VB[:-1] + "y"  # flies -> fly
		return VB
	# unknown tag suffix (e.g. "X" from "AUX"): hand back the lowercased word instead of falling off the end and returning None
	return VB


def segmenter(t, parent_node):
	'''
	takes a tree as input and segments the tree according to our hand-written break rules
	
	only the immediate children of t are examined; breaks are marked by inserting nltk.Tree("HH", ["BREAKruleN"]) nodes among them
	the rules are applied in a fixed order (3, 4/4a/4b, 5, 11, 12, 13, 14, 15, 16, 17, 18) and later rules see the breaks inserted by earlier ones, so the order of the sections below matters
	finally find_coord_VPs() and find_attr_verbs() are run on t
	
	@param t: is a tree, this is the output from the Charniak parser; it is modified in place
	@param parent_node: label of the parent of t (None for the root), only used to decide whether rule 4 marks a clause for movement
	@return: the same tree t
	'''
	
	#if a VP/ADJP has a child that is an S/SBAR/SBARQ  (and the S/SBAR/SBARQ does not have a restricted verb that is under an immediate child that is a VBP), then break before S/SBAR/SBARQ (making sure to only break if the VP/ADJP starts with a VB*/AUX/AUXG/JJ)
	#if 1st element of SBAR/S is tagged IN and is not in our restricted complement prepostion wordlist, then break regardless of no VB in VP (if the word is "if/whether/when/where" then the verb in the VP cannot be in our blacklist; do not break if we have "as" or "because" where the verb is tagged as AUX/AUXG; we also do not break on "as if" (unless there is a comma before "as if") and also handle cases where there is an adverb between the comma and "as if"; do not break on things like "are never quite sure if" where the VP here is headed by an AUX
	#if 1st element of SBAR/S is tagged TO and is not in our restricted TO wordlist (or the verb is not tagged as VBN/AUX/AUXG/JJ), then break (the verb in the VP cannot be in our blacklist; reason we break on VBN is sometimes adj are tagged as VBN)
	#if 1st element of SBAR/S is tagged WRB/WHADVP, then break as long as the word is not "if/whether/when" (and the verb is not in our restricted complement list or is not tagged as AUX/AUXG) or the word is not "how/why/where"
	if t.node == "VP" or t.node == "ADJP":
		verb = ""
		tag = ""
		# check for a VB* or AUX/AUXG/JJ as the head of the VP/ADJP (if several children qualify, the last one wins)
		for i in range(len(t)):
			if t[i].node.startswith("VB") or t[i].node in ["AUX", "AUXG", "JJ"]:
				if t[i].node in ["AUX", "AUXG", "JJ"]:
					verb = t[i][0]
				else:
					verb = stem_VB(t[i][0], t[i].node[2:])
				tag = t[i].node
		# while loop because t grows as breaks are inserted; i is bumped an extra step after an insertion to skip the clause just handled
		i = 0
		while i < len(t):
			if t[i].node in ["SBAR", "S", "SBARQ"]:
				first2elements = get_n_elements(t[i], 2)
				first3elements = get_n_elements(t[i], 3)
				# a leading adverb is skipped when looking for the word that introduces the clause
				if first2elements[0].node == "RB":
					key_element = first2elements[1]
				else:
					key_element = first2elements[0]
				if key_element.node == "IN" and key_element[0] not in sbar_complement_preplist_for_rule3:
					if (key_element[0].lower() == "if" or key_element[0].lower() == "whether" or key_element[0].lower() == "when" or key_element[0].lower() == "where") and (verb in sbar_complement_ifverblist_for_rule3 or t[0].node == "AUX"):
						pass
					elif (key_element[0].lower() == "as" or key_element[0].lower() == "because") and tag in ["AUX", "AUXG"]:
						pass
					elif first2elements[0][0] == "as" and first2elements[1][0] == "if":
						if  i > 0 and t[i-1].leaves()[-1] == ",":
							t.insert(i, nltk.Tree("HH", ["BREAKrule3"]))
					# i == 0 means nothing (hence no comma) precedes the clause; without this check t[i-1] would wrap around to the last child
					elif first3elements[0].node == "RB" and first3elements[1][0] == "as" and first3elements[2][0] == "if" and (i == 0 or t[i-1].leaves()[-1] != ","):
						pass
					else:
						t.insert(i, nltk.Tree("HH", ["BREAKrule3"]))
						i = i + 1
				elif key_element.node == "TO":
					if verb in sbar_complement_toverblist_for_rule3 or tag in ["VBN", "AUX", "AUXG", "JJ"]:
						pass
					else:
						t.insert(i, nltk.Tree("HH", ["BREAKrule3"]))
						i = i + 1
				elif key_element.node in ["WRB", "WHADVP"]:
					if (key_element[0].lower() == "if" or key_element[0].lower() == "whether" or key_element[0].lower() == "when") and (verb in sbar_complement_ifverblist_for_rule3 or tag in ["AUX", "AUXG"]):
						pass
					elif (key_element[0].lower() == "how" or key_element[0].lower() == "why" or key_element[0].lower() == "where"):
						pass
					else:
						t.insert(i, nltk.Tree("HH", ["BREAKrule3"]))
						i = i + 1
			i = i + 1
	
	
	#if node is S/SBAR/SBARQ with commas on both sides, break after the 1st comma
	#if element before comma is an NP, mark the node for possible movement (but not relative clause; clause to be moved immediately after the second comma)
	for i in range(1, len(t)-1):
		if t[i].node == "S" or t[i].node == "SBAR" or t[i].node == "SBARQ":
			if i > 1 and (t[i-1].node == "," and t[i+1].node == ","):
				# insert from right to left so the smaller indices are still valid
				t.insert(i+2, nltk.Tree("HH", ["BREAKrule4"]))
				if t[i-2].node == "NP" and ((parent_node == "S" and t.node == "NP") or (t.node == "S" or t.node == "SINV")) :
					t.insert(i, nltk.Tree("HH", ["BREAKrule4a"]))
					t.insert(i-2, nltk.Tree("HH", ["BREAKrule4b"]))
				else:
					t.insert(i, nltk.Tree("HH", ["BREAKrule4"]))
				break
	
	#the following breaks happen only if the node is a VP with commas on both sides 
	#if there is more than 1 leaf, with the first word being an RB and the 2nd is VBG/VBN, break after the 2nd comma
	#and if there is a NP before the comma and the parent of the VP is NP and the NP's parent is S, then mark the NP (NP that is on the same level as the VP) for possible movement; otherwise just break after the 2nd comma
	for i in range(2, len(t)-1):
		if t[i].node == "VP" and (t[i-1].node == "," and t[i+1].node == ","):
			leafs_and_pos = t[i].pos()
			if (leafs_and_pos[0][1] == "VBG" or leafs_and_pos[0][1] == "VBN") or (len(leafs_and_pos) > 1 and leafs_and_pos[0][1] == "RB" and (leafs_and_pos[1][1] == "VBG" or leafs_and_pos[1][1] == "VBN")):
				t.insert(i+2, nltk.Tree("HH", ["BREAKrule4"]))
				if t[i-2].node == "NP" and ((parent_node == "S" and t.node == "NP") or (t.node == "S" or t.node == "SINV")):
					t.insert(i, nltk.Tree("HH", ["BREAKrule4a"]))
					t.insert(i-2, nltk.Tree("HH", ["BREAKrule4b"]))
				else:
					t.insert(i, nltk.Tree("HH", ["BREAKrule4"]))
				
	#if node is S/SBAR/SBARQ with quotes on both sides, break before and after quotes
	for i in range(1, len(t)-1):
		if t[i].node == "S" or t[i].node == "SBAR" or t[i].node == "SBARQ":
			if (t[i-1].node == "\"" and t[i+1].node == "\""):
				t.insert(i+2, nltk.Tree("HH", ["BREAKrule5"]))
				t.insert(i-1, nltk.Tree("HH", ["BREAKrule5"]))
	
	#if a PP has an S as a child and the first word is not in our restricted complement word list, break before the PP (make sure there is no RB/CC before the PP)
	#make sure S/SBAR/SBARQ/VP/VB/VBN/VBG/VBD/VBP/VBZ is on same level of PP
	nodes_on_same_level = []
	for i in range(0, len(t)):
		nodes_on_same_level.append(t[i].node)
	S_on_same_level_as_PP = False
	for n in ["S", "SBAR", "SBARQ", "VP", "VB", "VBP","VBN","VBG","VBZ","VBD"]:
		if n in nodes_on_same_level:
			S_on_same_level_as_PP = True
	i = 1
	while i < len(t):
		# the element before the PP must be neither a CC nor an RB (this used to be joined with "or", which is always true and so never filtered anything)
		if t[i].node == "PP" and S_on_same_level_as_PP and (t[i-1].node != "CC" and t[i-1].node != "RB"):
			children_of_PP = []
			for j in range(len(t[i])):
				children_of_PP.append(t[i][j].node)
			first_element = t[i].leaves()[0].split("/")[0]
			if "S" in children_of_PP and first_element not in sbar_complement_preplist_for_rule3:
				t.insert(i, nltk.Tree("HH", ["BREAKrule11"]))
				i = i + 1
		i = i + 1
	#if a PP has an S as a child and a comma follows the PP and the first word is not in our restricted complement word list, break after the comma
	#and S/SBAR/SBARQ/VP on same level of PP
	i = 0
	while i < len(t)-1:
		if t[i].node == "PP" and S_on_same_level_as_PP:
			if t[i+1].node == ",":
				children_of_PP = []
				for j in range(len(t[i])):
					children_of_PP.append(t[i][j].node)
				first_element = t[i].leaves()[0].split("/")[0]
				if "S" in children_of_PP and first_element not in sbar_complement_preplist_for_rule3:
					t.insert(i+2, nltk.Tree("HH", ["BREAKrule11"]))	
		i = i + 1
	
	#if an S is directly under an SBAR/SINV/SQ and the first element is a CC, break before the S
	#this S directly under SBAR signifies an independent clause
	for i in range(0, len(t)):
		if(t.node == "SBAR" or t.node == "SINV" or t.node == "SQ"):
			if(t[i].node == "S"):
				if (t[i].pos()[0][1] == "CC"):
					t.insert(i, nltk.Tree("HH", ["BREAKrule12"]))
	
	#when NP follows SBAR/SBARQ and a comma, then break before NP
	for i in range(2, len(t)):
		if (t[i-2].node == "SBAR" or t[i-2].node == "SBARQ") and t[i-1].node == "," and t[i].node == "NP":
			t.insert(i, nltk.Tree("HH", ["BREAKrule13"]))
	
	# break when CC is directly between two VPs, and the CC is NOT "and"/"or"/"nor" and if there is a comma before the CC, check before the comma
	for i in range(1, len(t)-1):
		if t[i].node == "CC" and t[i][0] not in  ["or", "and", "nor"]:
			if t[i-1].node == ",":
				# i > 1: when the clause starts with a comma followed by a CC there is nothing before the comma (t[i-2] would wrap around to the last child)
				if i > 1 and ((t[i-2].node == "VP" and t[i+1].node == "VP") or (t[i-2].node == "ADJP" and t[i+1].node == "ADJP")):
					t.insert(i, nltk.Tree("HH", ["BREAKrule14"]))
			if (t[i-1].node == "VP" and t[i+1].node == "VP") or (t[i-1].node == "ADJP" and t[i+1].node == "ADJP"):
				t.insert(i, nltk.Tree("HH", ["BREAKrule14"]))
	
	#first, if S/SINV/SQ has S/SBAR/SBARQ/SQ followed by a comma as children, and S/SBAR/SBARQ/SQ or VP/ADJP appear anywhere after the comma, then break after comma
	#second, if S has S/SBAR/SBARQ/SQ followed by a CC as children, and S/SBAR/SBARQ/SQ or NP and VP/ADJP appear anywhere after the comma, then break before CC
	#third, if S has S/SBAR/SBARQ/SQ as a child, and S/SBAR/SBARQ/SQ or NP and VP/ADJP appear anywhere after, then break after S/SBAR/SBARQ/SQ
	if t.node == "S" or t.node == "SINV" or t.node == "SQ":
		for i in range(1, len(t)):
			#here we check the case where we have a special ADVP beginning with "as soon as"
			as_soon_as_ADVP = False
			if t[i-1].node == "ADVP":
				first3elements = get_n_elements(t[i-1], 3)
				if first3elements[0][0].lower() == "as" and first3elements[1][0] == "soon" and first3elements[2][0] == "as":
					as_soon_as_ADVP = True
			if (t[i-1].node == "S" or t[i-1].node == "SBAR" or t[i-1].node == "SBARQ" or t[i-1].node == "SQ" or as_soon_as_ADVP) and (t[i].node == ","):
				tags_of_tokens_appearing_after_comma = []
				for j in range(i, len(t)):
					tags_of_tokens_appearing_after_comma.append(t[j].node)
				if "S" in tags_of_tokens_appearing_after_comma or "SBAR" in tags_of_tokens_appearing_after_comma or "SQ" in tags_of_tokens_appearing_after_comma or "SBARQ" in tags_of_tokens_appearing_after_comma or "VP" in tags_of_tokens_appearing_after_comma or "ADJP" in tags_of_tokens_appearing_after_comma:
					t.insert(i+1, nltk.Tree("HH", ["BREAKrule15"]))
			elif (t[i-1].node == "S" or t[i-1].node == "SBAR" or t[i-1].node == "SQ" or t[i-1].node == "SBARQ") and (t[i].node == "CC"):
				tags_of_tokens_appearing_after_CC = []
				for j in range(i, len(t)):
					tags_of_tokens_appearing_after_CC.append(t[j].node)
				if "S" in tags_of_tokens_appearing_after_CC or "SBAR" in tags_of_tokens_appearing_after_CC or "SBARQ" in tags_of_tokens_appearing_after_CC or "SQ" in tags_of_tokens_appearing_after_CC or ("NP" in tags_of_tokens_appearing_after_CC and ("VP" in tags_of_tokens_appearing_after_CC or "ADJP" in tags_of_tokens_appearing_after_CC)):
					t.insert(i, nltk.Tree("HH", ["BREAKrule15"]))
			elif (t[i-1].node == "S" or t[i-1].node == "SBAR" or t[i-1].node == "SQ" or t[i-1].node == "SBARQ"):
				tags_of_tokens_appearing_after_S = []
				for j in range(i, len(t)):
					tags_of_tokens_appearing_after_S.append(t[j].node)
				if "S" in tags_of_tokens_appearing_after_S or "SBAR" in tags_of_tokens_appearing_after_S or "SQ" in tags_of_tokens_appearing_after_S or "SBARQ" in tags_of_tokens_appearing_after_S or ("NP" in tags_of_tokens_appearing_after_S and ("VP" in tags_of_tokens_appearing_after_S or "ADJP" in tags_of_tokens_appearing_after_S)):
					t.insert(i, nltk.Tree("HH", ["BREAKrule15"]))
	
	# if S/SINV/SQ has VP as a child followed by S/SBAR/SBARQ anywhere after it, break after VP
	if t.node == "S" or t.node == "SINV" or t.node == "SQ":
		i = 1
		while i < len(t):
			if t[i-1].node == "VP":
				tags_of_tokens_appearing_after_VP = []
				for j in range(i, len(t)):
					tags_of_tokens_appearing_after_VP.append(t[j].node)
				if "S" in tags_of_tokens_appearing_after_VP or "SBAR" in tags_of_tokens_appearing_after_VP or "SBARQ" in tags_of_tokens_appearing_after_VP:
					t.insert(i, nltk.Tree("HH", ["BREAKrule16"]))
					i = i + 1
			i = i + 1
	
	#break after colons/semicolons/ellipsis
	# walk right to left so insertions do not shift the children still to be visited (the first child is never checked)
	for i in range(len(t)-1, 0, -1):
		if t[i][0] == ":" or t[i][0] == ";" or t[i][0] == "...":
			t.insert(i+1, nltk.Tree("HH", ["BREAKrule17"]))
	
	#first, if S/SBAR/SBARQ follows comma, then break after comma
	#	also breaks when there is a comma then a ADVP/RB then a S/SBAR/SBARQ/SINV
	#second, if S/SBAR/SBARQ follows CC, then break before comma
	#third, if CC follows comma and S/SBAR/SBARQ/VP appears anywhere after the comma, then break after comma but before the CC
	i = 1
	while i < len(t) - 1:
		if (t[i].node == "," and (t[i+1].node in ["S", "SBAR", "SBARQ", "SINV"] or ( i<len(t)-2 and t[i+2].node in ["S", "SBAR", "SBARQ", "SINV"] and t[i+1].node in ["ADVP", "RB"]))):
			t.insert(i+1, nltk.Tree("HH", ["BREAKrule18"]))
			i = i + 1
		elif  t[i].node == "CC" and (t[i+1].node in ["S", "SBAR", "SBARQ"]):
			t.insert(i, nltk.Tree("HH", ["BREAKrule18"]))
			i = i + 1
		elif t[i].node == "," and t[i+1].node == "CC":
			S_appears_after_comma = False
			for j in range(i+1, len(t)):
				if t[j].node in ["S", "SBAR", "SBARQ", "SINV", "VP"]:
					S_appears_after_comma = True
			if S_appears_after_comma:
				t.insert(i+1, nltk.Tree("HH", ["BREAKrule18"]))
				i = i + 1
		i = i + 1
	
	find_coord_VPs(t)
	find_attr_verbs(t)
	
	return t


def find_attr_verbs(t):
	'''
	sets attr_in_sentence[0] to True when t is a VP that consists of a single word whose stem is in list_of_attr_verbs
	the flag is never reset here
	
	@param t: tree to inspect
	'''
	if t.node != "VP":
		return
	if len(t.leaves()) != 1:
		return
	only_child = t[0]
	# the tag suffix after the first two characters tells stem_VB() which inflection to strip
	stemmed_verb = stem_VB(only_child[0], only_child.node[2:])
	if stemmed_verb in list_of_attr_verbs:
		attr_in_sentence[0] = True


def find_coord_VPs(t):
	'''
	looks for coordinated VPs among the immediate children of t and logs every pattern it finds to VPoutfile
	
	patterns: "VP , VP", "VP CC VP", "VP , CC VP" and, when t itself is a VP, more than 1 / more than 2 VP children
	when t is a VP with more than 2 VP children, every HH (break) child is removed unless the first element of the child right after it is "but"
	if anything was logged, f2name followed by three newlines is written at the end
	
	@param t: tree whose children are inspected (modified in place when HH children are removed)
	'''
	def log_match(header, words):
		# the header line, then all words on one line (each preceded by a space)
		VPoutfile.write(header + "".join([" " + word for word in words]) + "\n")
	
	found_pattern = False
	start = 0
	# len(t) is re-read on every pass because the VP branch below may remove children
	while start < len(t)-2:
		if t[start].node == "VP" and t[start+1].node == "," and t[start+2].node == "VP":
			log_match("VP , VP\n", t.leaves())
			found_pattern = True
		if t[start].node == "VP" and t[start+1].node == "CC" and t[start+2].node == "VP":
			log_match("VP CC VP\n", t.leaves())
			found_pattern = True
		if t.node == "VP":
			child_labels = [child.node for child in t]
			vp_children = child_labels.count("VP")
			if vp_children > 1:
				log_match("VP with more than 1 VP as children\n", t.leaves())
				found_pattern = True
			if vp_children > 2:
				# the words are captured before any break is removed, so the log still shows them
				words_before_removal = t.leaves()
				hh_positions = [k for k in range(len(t)) if t[k].node == "HH"]
				# go right to left so the remaining positions stay valid after each pop
				for k in reversed(hh_positions):
					if t[k+1][0] not in ["but"]:
						t.pop(k)
				log_match("VP with more than 2 VP as children\n", words_before_removal)
				found_pattern = True
		start = start + 1
	
	start = 0
	while start < len(t)-3:
		if t[start].node == "VP" and t[start+1].node == "," and t[start+2].node == "CC" and t[start+3].node == "VP":
			log_match("VP , CC VP\n", t.leaves())
			found_pattern = True
		start = start + 1
	
	if found_pattern:
		VPoutfile.write(f2name)
		VPoutfile.write("\n\n\n")


def traverse_nodes(t, parent_node=None):
	'''
	walks the tree top-down: every subtree whose first child is not a word is run through segmenter() and then its children are visited in turn
	a subtree whose first child is a word ends the recursion; the word gets "/" plus the subtree's label appended, except for the "<PARAGRAPH>" marker which is left alone
	
	@param t: tree as input (modified in place)
	@param parent_node: label of the parent of t, passed on to segmenter()
	'''
	words = t.leaves()
	first_child = t[0]
	if first_child not in words:
		segmenter(t, parent_node)
		for subtree in t:
			traverse_nodes(subtree, t.node)
		return
	if first_child != "<PARAGRAPH>":
		t[0] = first_child + "/" + t.node
		


def remove_breakrules_before_period(l):
	'''
	removes every break rule that sits directly before the last element (normally the sentence-final period)
	lists with 2 or fewer elements are returned untouched
	
	@param l: list of leaves ("word/TAG" strings and break rule markers)
	@return: l itself if nothing had to be removed, otherwise a new list without the trailing break rules
	'''
	temp = l
	if len(temp) > 2:
		# the length has to be re-checked on every pass: once only the last element is left, temp[-2] would raise an IndexError
		while len(temp) >= 2 and temp[-2] in break_rule_list:
			temp = temp[:-2] + temp[-1:]
	return temp


def remove_consecutive_breakrules(leaves):
	'''
	collapses runs of break rules so that at most one break marker is kept per position
	
	a break rule that is directly followed by another break rule is dropped (the later one is kept), unless it is "BREAKrule4/HH"
	a break rule that directly follows a rule 4 marker (4, 4a or 4b) is dropped as well
	the last element is always kept
	
	@param leaves: list of leaves ("word/TAG" strings and break rule markers)
	@return: new list with the redundant break rules removed ([] for empty input)
	'''
	if not leaves:
		return []
	rule4_markers = ["BREAKrule4b/HH", "BREAKrule4a/HH", "BREAKrule4/HH"]
	temp = []
	for i in range(len(leaves)-1):
		if leaves[i] in break_rule_list and leaves[i+1] in break_rule_list and leaves[i] != "BREAKrule4/HH":
			pass
		# i > 0: the first leaf has no predecessor (leaves[i-1] would wrap around to the last leaf)
		elif i > 0 and leaves[i-1] in rule4_markers and leaves[i] in break_rule_list:
			pass
		else:
			temp.append(leaves[i])
	temp.append(leaves[-1])
	return temp


def write_segments_to_files(t, fout, is_first_tree_to_avoid_starting_text_with_closing_paragraph):
	'''
	writes one segmented sentence to the output files
	every sentence gets wrapped in <S></S>
	every clause gets wrapped in <C></C>
	the text between BREAKrule4b and BREAKrule4a is wrapped in <M></M> (moved clause)
	and is inserted again at the start of the clause opened by the next BREAKrule4
	a sentence that consists only of the <PARAGRAPH> marker is written as </P><P>
	
	three files are written:
	 - fout: the segments together with the names of the rules that caused each break
	 - fout2 (module-level global): the segments without the rule names
	 - fout3 (module-level global): the leaves of t exactly as they are in the tree,
	   i.e. with all breakrules and before any surface processing
	
	@param t: tree that has breakrules inserted as leaves
	@param fout: output file 2 write the version with rule names to
	@param is_first_tree_to_avoid_starting_text_with_closing_paragraph: only compared
	       with "is False"; if it is False nothing is written to fout/fout2
	'''
	temp = ""
	temp2 = ""
	temp_for_rule4a = ""
	between_4a_and_4b = False
	l = remove_breakrules_before_period(t.leaves())
	l2 = remove_consecutive_breakrules(l)
	leaves = surface_pass(l2)
	
	for leaf in leaves:
		if leaf in break_rule_list:
			if leaf == "BREAKrule4/HH" and temp_for_rule4a != "":
				temp = temp + "</C>\n" + leaf + "\n<C>" + temp_for_rule4a
				temp2 = temp2 + "</C>\n" + "<C>" + temp_for_rule4a
				# reset the buffer so that the moved clause is inserted only once
				# (this used to assign to a misspelled name and never reset anything)
				temp_for_rule4a = ""
			else:
				leaf = "\n*** " + leaf + "\n" + "<C>"
				temp = temp  + "</C>" + leaf + " "
				temp2 = temp2 + "</C>" +  "\n" + "<C>"
		elif leaf == "BREAKrule4b/HH":
			between_4a_and_4b = True
			# <M> = moved clause
			temp =  temp + "\n" + "<M>" + leaf 
			temp2 = temp2 + "\n" + "<M>"
		elif leaf == "BREAKrule4a/HH":
			between_4a_and_4b = False
			# <M> = moved clause
			temp = temp + "</M>" + leaf + "\n" 
			temp2 = temp2 + "</M>" + "\n"
		elif leaf == "surfaceRule4b/HH":
			# <M> = moved clause
			temp = temp + "</M>" + leaf + "\n" 
			temp2 = temp2 + "</M>" + "\n"
		elif leaf == "surfaceRule4a/HH":
			# <M> = moved clause
			temp =  temp + "\n" + "<M>" + leaf 
			temp2 = temp2 + "\n" + "<M>"
		elif leaf =="break_within_paren/HH":
			temp =  temp + "\n" + "<breakWithinParens>" + leaf 
			temp2 = temp2 + "\n" + "<breakWithinParens>"
		else:
			if between_4a_and_4b:
				temp_for_rule4a = temp_for_rule4a + leaf + " "
				# if the segment being moved has a comma at the end, strip the comma
				# so as to avoid "Opus Dei, figure prominently into the equation"
				if temp_for_rule4a[-4:] == ",/, ":
					temp_for_rule4a = temp_for_rule4a[:-4]
			temp = temp  + leaf + " "
			temp2 = temp2  + leaf + " "
	
	
	# NOTE(review): the main script passes the integer sentence index here, and an
	# int is never identical to False, so this branch does not run for that caller.
	# confirm what was intended before changing it (== False would drop the first sentence)
	if is_first_tree_to_avoid_starting_text_with_closing_paragraph is False:
		temp = ""
		temp2 = ""
	else:
		if temp == "<PARAGRAPH> " or temp == "<PARAGRAPH>" or temp == "<PARAGRAPH> ./. ":
			temp = "</P>\n<P>" + "\n"
		else:
			temp = "<S>\n<C>" + temp + "</C>\n</S>" + "\n"
		if temp2 == "<PARAGRAPH> " or temp2 == "<PARAGRAPH>" or temp2 == "<PARAGRAPH> ./. ":
			temp2 = "</P>\n<P>" + "\n"
		else:
			temp2 = "<S>\n<C>" + temp2 + "</C>\n</S>" + "\n"
	
	
	fout.write(temp)
	fout2.write(temp2)
	
	# dump of the untouched tree leaves: all breakrules, no surfacerules
	temp_with_all_breakrules_and_no_surfacerules = ""
	for x in t.leaves():
		temp_with_all_breakrules_and_no_surfacerules = temp_with_all_breakrules_and_no_surfacerules + " " + x
	temp_with_all_breakrules_and_no_surfacerules = temp_with_all_breakrules_and_no_surfacerules + "\n\n"
	fout3.write(temp_with_all_breakrules_and_no_surfacerules)


def get_n_elements(tree, n):
    '''
    collects, from left to right, the first subtrees whose first child is not
    itself a tree (i.e. the word-level nodes), stopping once n have been found

    @param tree: tree to search
    @param n: number of word-level nodes wanted
    @return: list of the nodes found; [tree] itself if tree is already word-level
    '''
    if not isinstance(tree[0], nltk.tree.Tree):
        return [tree]
    collected = []
    for child in tree:
        if len(collected) >= n:
            break
        # only ask the child for as many nodes as are still missing
        collected.extend(get_n_elements(child, n - len(collected)))
    return collected


def _cue_token_matches(possibilities, token):
	'''
	returns True if the "word/TAG" token satisfies at least one of the alternatives
	an alternative containing "/" has to equal the whole token, an all-uppercase
	alternative is compared with the tag, anything else with the lowercased word
	'''
	for possibility in possibilities:
		if "/" in possibility:
			if possibility == token:
				return True
		elif possibility.isupper():
			if possibility == token.split("/")[1]:
				return True
		elif possibility == token.split("/")[0].lower():
			return True
	return False


def delete_breakrules_for_phrasal_discourse_cues(rule, current_index, list_of_words_and_tags):
	'''
	checks whether a phrasal cue is located at current_index and, if so, which
	breakrule(s) around it should be deleted
	
	the cue matches when the word at current_index is the key (or the key is
	"attrib-verb" and the stemmed word is in list_of_attr_verbs), the words/tags
	directly before it match stuff_before (1st entry = N-1, 2nd = N-2, ...) and the
	words/tags directly after it match stuff_after (N+1, N+2, ...)
	
	@param rule: tuple (key, stuff_before, stuff_after); stuff_before/stuff_after are lists of lists of alternatives
	@param current_index: index of the word to test against the key
	@param list_of_words_and_tags: list of "word/TAG" strings and breakrule tokens
	@return: -9999 if nothing is to be deleted,
	         an index if one breakrule is to be deleted,
	         a tuple (index_before, index_after) if the breakrules on both sides are to be deleted
	'''
	(key, stuff_before, stuff_after) = rule
	tokens = list_of_words_and_tags
	number_of_tokens = len(tokens)
	current_word = tokens[current_index].split("/")[0].lower()
	# check if the word is not the key
	# or key is an attribute verb and the word is in the list of attr verbs
	if key != current_word and not (key == "attrib-verb" and stem_VB(current_word, tokens[current_index].split("/")[1][2:]) in list_of_attr_verbs):
		return -9999
	
	#testing to see if words before the keyword match the words before in our list
	if current_index < len(stuff_before):
		return -9999
	for i in range(len(stuff_before)):
		if not _cue_token_matches(stuff_before[i], tokens[current_index - i - 1]):
			return -9999
	
	#testing to see if words after the keyword match the words after in our list
	if current_index + len(stuff_after) >= number_of_tokens:
		return -9999
	for i in range(len(stuff_after)):
		if not _cue_token_matches(stuff_after[i], tokens[current_index + i + 1]):
			return -9999
	
	# before_index/after_index = positions directly before/after the whole cue
	before_index = current_index - len(stuff_before) - 1
	after_index = current_index + len(stuff_after) + 1
	# the cue ends the list: there is nothing after it to look at
	if after_index >= number_of_tokens:
		return -9999
	after_is_comma = tokens[after_index].split("/")[0] == ","
	
	# cue near the start of the sentence, followed by a comma and a breakrule
	if before_index < 3 and after_is_comma and after_index + 1 < number_of_tokens and tokens[after_index+1] in break_rule_list:
		return after_index+1
	# cue between commas, after an auxiliary/modal, with a breakrule directly before it
	# (before_index >= 2 keeps before_index-2 from wrapping around to the end of the list)
	elif before_index >= 2 and after_is_comma and tokens[before_index-1].split("/")[0] == "," and tokens[before_index-2].split("/")[1] in ["AUX", "AUXG", "MD"] and tokens[before_index] in break_rule_list:
		if after_index + 1 < number_of_tokens and tokens[after_index+1] in break_rule_list:
			return (before_index, after_index+1)
		else:
			return before_index
	# cue followed by punctuation, with a comma and a breakrule directly before it
	# (before_index >= 1 keeps before_index-1 from wrapping around to the end of the list)
	elif before_index >= 1 and (tokens[after_index].split("/")[1] in [",", ";", ":", "-", "!", "?", "(", ")", "."]) and tokens[before_index] in break_rule_list and tokens[before_index-1].split("/")[0] == ",":
		return before_index
	else:
		return -9999


def surface_pass(leaves):
	"""
	the purpose of this function is:
	to post-process the flat list of leaves ("word/TAG" strings with the breakrules
	already inserted as "BREAKruleN/HH" tokens) with surface-level heuristics
	
	the passes, in order:
	1. join an ellipsis that was split into ".." and "." back into one ".../:" token
	   (all following passes are skipped if the list contains "<PARAGRAPH>")
	2. delete breakrules in specific contexts (clause without a verb, comparative
	   "as ... as", verbs coordinated by a CC); BREAKrule4a/4b are deleted as well
	   if a BREAKrule4 was deleted
	3. insert "surfaceRule1/HH" where test_surface_rule reports a match
	4. delete breakrules around phrasal discourse cues and around attribution cues
	5. wrap parentheticals (-LRB- ... -RRB-) that contain a verb in surfaceRule4a/4b
	6. do the same for material between dashes
	
	@param leaves: list of leaves; the ellipsis fix inserts into this list in place
	@return: the processed list of leaves
	"""
	# fix the problem with ellipses due to Charniak parser breaking them up and tagging them
	i = 0
	while i < (len(leaves) - 2):
		if leaves[i].split("/")[0] == ".." and leaves[i+1].split("/")[0] == ".":
			leaves.insert(i, ".../:")
			leaves = leaves[:i+1] + leaves[i+2:]
			leaves = leaves[:i+1] + leaves[i+2:]
		i = i + 1
		
	#
	# this section looks through the sentence and deletes breakrules occurring in a specific context.
	# index_of_last_breakrule keeps track of index of the last breakrule.
	# indices_of_previous_breakrules is a list of the indices of all encountered breakrules. when the breakrule at index_of_last_breakrule is deleted, the index of the previous breakrule is the new value for index_of_last_breakrule (-1 is used to indicate that there is no previous breakrule).
	# in the case that there are no clauses without verbs, found_verb_since_last stores whether a verb/auxiliary has been seen since the last break rule (or since the beginning of the sentence).
	# if we don't find a verb between two breakrules, delete the preceeding breakrule. if there is no preceding breakrule, delete the current breakrule 
	# if we find an "as" after a breakrule, look to see if there is another "as" within the range specified by as_window (indicating that this "as" is being used comparatively e.g. "as big as"), and delete the breakrule before the current "as". 
	# if we find two verbs broken by a CC, it deletes the breakrule before the CC (e.g. "I can't decide whether I love or hate this movie").
	# if we delete breakrule4, we also delete breakrules 4a and 4b
	#
	#
	if "<PARAGRAPH>" not in leaves:
		index_of_last_breakrule = -1
		indices_of_previous_breakrules = []
		# a window of words before and after the word "as" to look for another "as"
		# if as_window = 5, then we look 5 words before and 5 words after
		as_window = 6
		found_verb_since_last = False
		i = 0
		# delete breakrules
		deleted_breakRule4 = False
		while i < len(leaves):
			# found a breakrule
			if leaves[i] in break_rule_list:
				if not found_verb_since_last:
					if index_of_last_breakrule == -1:
						if leaves[i].split("/")[0] == "BREAKrule4":
							deleted_breakRule4 = True
						leaves = leaves[:i] + leaves[i+1:] # delete current breakrule
					else:
						if leaves[index_of_last_breakrule].split("/")[0] == "BREAKrule4":
							deleted_breakRule4 = True
						leaves = leaves[:index_of_last_breakrule] + leaves[index_of_last_breakrule+1:] # delete previous breakrule
						index_of_last_breakrule = i - 1
				# this is the case where we find a verb since last breakrule
				else:
					if index_of_last_breakrule != -1:
						indices_of_previous_breakrules.append(index_of_last_breakrule)
					index_of_last_breakrule = i
					i += 1
				found_verb_since_last = False
			# the word "as" follows a breakrule
			# so look to see if there is another "as" within the range specified by as_window, if so remove the breakrule
			elif leaves[i].split("/")[0] == "as" and i> 0 and leaves[i-1] in break_rule_list:
				start = max(0, i-as_window) 
				end = min(len(leaves) - 1, i + as_window)
				if "as/IN" in leaves[start:i-1] or "as/RB" in leaves[start:i-1] or "as/IN" in leaves[i+1:end] or "as/RB" in leaves[i+1:end]:
					if leaves[i-1] == "BREAKrule4/HH":
						deleted_breakRule4 = True
					leaves = leaves[:i-1] + leaves[i:]
					if indices_of_previous_breakrules:
						index_of_last_breakrule = indices_of_previous_breakrules[-1]
						indices_of_previous_breakrules = indices_of_previous_breakrules[:-1]
					else:
						index_of_last_breakrule = -1
				else:
					i+= 1
			# CC follows a breakrule
			# we check to make sure we don't break before the "or" in "to seriously injure, or even kill"
			# trying to handle cases of following form VB* CC VB* or VB*, CC VB* or VB*, CC RB VB*
			elif leaves[i].split("/")[1] == "CC" and i>1 and leaves[i-1] in break_rule_list:
				'''
				these are cases of following form VB* CC VB* or VB*, CC VB* or VB*, CC RB VB* that we are trying to handle
				1. to seriously injuring or even killing #FIXED
				2. to seriously injuring, or killing #FIXED
				3. to seriously injure or even kill #FIXED
				4. to seriously injure, or even kill #FIXED
				5. to be injured/VBD or even killing/VBG (tags differ) #FIXED
				6. to seriously injure or kill #this doesn't have a breakRule inserted
				7. to seriously injure, or kill #FIXED
				8. to injure or kill #this doesn't have a breakRule inserted
				9. to seriously injuring, or even killing #FIXED
				10. to seriously injured or even killed #FIXED
				no19.txt
				Such trappings suggest a glorious past but give no hint of a troubled present.
				This story would have us believe that the union would take any opportunity to seriously injure or even kill a middle aged single Mom.
				Their prodding was seriously injuring or even killing a middle aged single Mom.
				This story would have us believe that the union would take any opportunity to seriously injuring, or killing a middle aged single Mom.
				This story would have us believe that the union would take any opportunity to seriously injure or even kill a middle aged single Mom.
				This story would have us believe that the union would take any opportunity to seriously injure, or even kill a middle aged single Mom.
				This story would have us believe that the union would take any opportunity to be injured or even killing a middle aged single Mom.
				This story would have us believe that the union would take any opportunity to seriously injure or kill a middle aged single Mom.
				This story would have us believe that the union would take any opportunity to seriously injure, or kill a middle aged single Mom.
				This story would have us believe that the union would take any opportunity to injure or kill a middle aged single Mom.
				This story would have us believe that the union would take any opportunity to seriously injuring, or even killing a middle aged single Mom.
				This story would have us believe that the union would take any opportunity to seriously injured or even killed a middle aged single Mom.
				'''
				# a tag that lies outside the sentence is treated as "" (matches nothing),
				# so a CC close to the end of the sentence can't raise an IndexError and
				# i-3 can't wrap around to the end of the list when i == 2
				number_of_leaves = len(leaves)
				next1tag = ""
				next2tag = ""
				previous3tag = ""
				if i + 1 < number_of_leaves:
					next1tag = leaves[i+1].split("/")[1]
				if i + 2 < number_of_leaves:
					next2tag = leaves[i+2].split("/")[1]
				# previous1tag is the breakRule/HH
				previous2tag = leaves[i-2].split("/")[1]
				if i >= 3:
					previous3tag = leaves[i-3].split("/")[1]
				
				if next1tag.startswith("VB") and (previous2tag.startswith("VB") or (previous2tag == "," and previous3tag.startswith("VB"))):
					# the breakrule being deleted is at i-1 (leaves[i] is the CC itself)
					if leaves[i-1].split("/")[0] == "BREAKrule4":
						deleted_breakRule4 = True
					leaves = leaves[:i-1] + leaves[i:]
					if indices_of_previous_breakrules:
						index_of_last_breakrule = indices_of_previous_breakrules[-1]
						indices_of_previous_breakrules = indices_of_previous_breakrules[:-1]
					else:
						index_of_last_breakrule = -1
				elif (next1tag == "RB" and next2tag.startswith("VB")) and ((previous2tag.startswith("VB") == next2tag.startswith("VB")) or (previous2tag == "," and previous3tag.startswith("VB") == next2tag.startswith("VB"))):
					# the breakrule being deleted is at i-1 (leaves[i] is the CC itself)
					if leaves[i-1].split("/")[0] == "BREAKrule4":
						deleted_breakRule4 = True
					leaves = leaves[:i-1] + leaves[i:]
					if indices_of_previous_breakrules:
						index_of_last_breakrule = indices_of_previous_breakrules[-1]
						indices_of_previous_breakrules = indices_of_previous_breakrules[:-1]
					else:
						index_of_last_breakrule = -1
				else:
					i = i + 1
			else:
				if leaves[i].split("/")[1] in ["VB", "VBP","VBN","VBG","VBZ","VBD", "AUX", "AUXG", "MD"]:
					found_verb_since_last = True
				i+=1
		# check if we deleted breakrule4 to handle the movements associated with 4a/4b
		# delete breakrule4a and 4b if we deleted breakrule4
		if deleted_breakRule4:
			j = 0
			while j < i:
				if leaves[j].split("/")[0] == "BREAKrule4a" or leaves[j].split("/")[0] == "BREAKrule4b":
					leaves = leaves[:j] + leaves[j+1:]
					i = i - 1
				j = j + 1
			deleted_breakRule4 = False
		# check last clause to see if there is a verb in it, since the above doesn't check the last clause but checks all previous ones
		if found_verb_since_last == False:
			if index_of_last_breakrule != -1 and leaves[index_of_last_breakrule] != "BREAKrule4/HH":
				leaves = leaves[:index_of_last_breakrule] + leaves[index_of_last_breakrule+1:] #delete previous
		
		#
		# insert a surface rule only if there is no breakrule inserted anywhere in the previous 3 words
		# we only insert one surfacerule per word in our surfacerule list
		# if there is a CC then RB located immediately before where we want to insert a surfacerule (eg. "and not because"), then insert the surfacerule before the CC
		# if there is a CC or RB located immediately before where we want to insert a surfacerule (eg. "and because" or "only because"), then insert the surfacerule before the CC/RB
		# don't insert a surfacerule after "that/IN"
		#
		#
		i = 3
		while i < len(leaves) - 1:
			# we check to insert a surface rule only if there is no breakrule inserted anywhere in the previous 3 words
			if leaves[i-1].split("/")[1] != "HH" and leaves[i-2].split("/")[1] != "HH" and leaves[i-3].split("/")[1] != "HH":
				j = 0
				done = False
				# we make sure to only insert one surfacerule per word in our surfacerule list
				while j < len(surface_break_list) and not done:
					relative_insertion_index = test_surface_rule(surface_break_list[j], i, leaves)
					if relative_insertion_index != -9999:
						insertion_index = i + relative_insertion_index
						# now deal with cases like "and not because", "and only because", "and just because"
						if leaves[insertion_index - 1].split("/")[1] == "RB" and leaves[insertion_index - 2].split("/")[1] == "CC":
							insertion_index -= 2
						# deals with "and because", "only because" etc.
						elif leaves[insertion_index - 1].split("/")[1] == "RB" or leaves[insertion_index - 1].split("/")[1] == "CC":
							insertion_index -= 1
						# we insert a surfacerule if the word/tag before where we want to insert is not "that/IN"
						if leaves[insertion_index - 1] != "that/IN":
							leaves.insert(insertion_index, "surfaceRule1/HH")
							i = i + 1
							done = True #never insert more than one surface rule per key word
					j = j + 1
			i = i + 1
		
		#
		# delete breaks around phrasal discourse cues of type "as it stands" or "i believe", where there could be a break before or after the phrasal cue, but there shouldn't be
		#
		#
		i = 0
		while i < len(leaves) - 1:
			j = 0
			done = False
			while j < len(phrasal_discourse_cues) and not done:
				deletion_index = delete_breakrules_for_phrasal_discourse_cues(phrasal_discourse_cues[j], i, leaves)
				if deletion_index != -9999:
					# a tuple means two breakrules: delete the later one first so that
					# the index of the earlier one stays valid
					if isinstance(deletion_index, tuple):
						x, y = deletion_index
						leaves = leaves[:y] + leaves[y+1:]
						deletion_index = x
					leaves = leaves[:deletion_index] + leaves[deletion_index +1:]
					done = True
					i -= 1
				j += 1
			i += 1
		
		#
		#
		#
		#
		#
		# Same, but for attribution verbs
		#
		i=0
		while i < len(leaves) - 1:
			j = 0
			done = False
			while j < len(attribution_cues) and not done:
				deletion_index = delete_breakrules_for_phrasal_discourse_cues(attribution_cues[j], i, leaves)
				if deletion_index != -9999:
					if isinstance(deletion_index, tuple):
						x, y = deletion_index
						leaves = leaves[:y] + leaves[y+1:]
						deletion_index = x
					leaves = leaves[:deletion_index] + leaves[deletion_index +1:]
					done = True
					i -= 1
				j += 1
			i += 1
		
		
		#
		# find matching LRB and RRB within sentence and wrap them in surfaceRule for possible movement, unless the whole sentence is a parenthetical i.e. LRB is first token and RRB is the last (or 2nd last) token
		# insert a surfaceRule before LRB and a surfaceRule after RRB, only if there is a VB/VBP/VBN/VBG/VBZ/VBD/AUX/AUXG between the parentheses
		# keep all breakrules and surfacerules between LRB and RRB (only if the clauses have a verb)
		#
		#
		whole_sentence_is_a_parenthetical = False
		if leaves[0].split("/")[0] == "-LRB-" and (leaves[len(leaves)-1].split("/")[0] == "-RRB-" or leaves[len(leaves)-2].split("/")[0] == "-RRB-"):
			whole_sentence_is_a_parenthetical = True
		i = 0
		j = 0
		while i < len(leaves) and not whole_sentence_is_a_parenthetical:
			if leaves[i].split("/")[0] == "-LRB-":
				found_verb_since_last = False
				for j in range(i+1, len(leaves)):
					if leaves[j].split("/")[0] == "-RRB-":
						parenthetical_already_tagged_as_clause = j < len(leaves)-1 and leaves[i-1].split("/")[1] == "HH" and leaves[j+1].split("/")[1] == "HH"
						k = i
						while k < j:
							if leaves[k].split("/")[1] in ["VB","VBP","VBN","VBG","VBZ","VBD","AUX","AUXG"]:
								found_verb_since_last = True
							#delete all breakrules/surfacerules betwen hyphens
							if leaves[k].split("/")[1] == "HH":
								index_to_check_for_verb_until_end = k + 1
								found_verb_after_current_breakrule = False
								while index_to_check_for_verb_until_end < j and leaves[index_to_check_for_verb_until_end].split("/")[1] != "HH":
									if leaves[index_to_check_for_verb_until_end].split("/")[1] in ["VB","VBP","VBN","VBG","VBZ","VBD","AUX","AUXG"]:
										found_verb_after_current_breakrule = True
									index_to_check_for_verb_until_end = index_to_check_for_verb_until_end + 1
								if parenthetical_already_tagged_as_clause:
									pass
								elif found_verb_since_last and found_verb_after_current_breakrule and leaves[index_to_check_for_verb_until_end].split("/")[0] not in ["BREAKrule4a", "BREAKrule4b"]:
									leaves[k] = "break_within_paren/HH"
								else:
									leaves = leaves[:k] + leaves[k+1:]
									j = j - 1
									k = k - 1
							k = k + 1
						if found_verb_since_last and not parenthetical_already_tagged_as_clause:
							leaves.insert(j+1, "surfaceRule4b/HH")
						break
				if found_verb_since_last and not parenthetical_already_tagged_as_clause:
					leaves.insert(i, "surfaceRule4a/HH")
					i = i + 1
			i = i + 1
		
		#
		# when we have matching dashes we wrap them in surfaceRule for possible movement, similar to how we handle parentheses
		# a single unmatched dash gets matched to the end of the sentence
		# there must be a VB/VBP/VBN/VBG/VBZ/VBD/AUX/AUXG between the two dashes (or dash and end of sentence)
		# keep all breakrules and surfacerules between hyphens (only if the clauses have a verb)
		#
		#
		i = 0
		j = 0
		while i < len(leaves):
			if leaves[i].split("/")[0] == "-":
				first_dash_location = i
				found_verb_since_last = False
				for j in range(i+1, len(leaves)):
					#match a dash to either a second dash or end of sentence
					if leaves[j].split("/")[0] == "-" or leaves[j].split("/")[1] == ".":
						parenthetical_already_tagged_as_clause = j < len(leaves)-1 and leaves[first_dash_location-1].split("/")[1] == "HH" and leaves[j+1].split("/")[1] == "HH"
						k = i + 1
						while k < j:
							if leaves[k].split("/")[1] in ["VB","VBP","VBN","VBG","VBZ","VBD","AUX","AUXG"]:
								found_verb_since_last = True
							#delete all breakrules/surfacerules betwen hyphens
							if leaves[k].split("/")[1] == "HH":
								index_to_check_for_verb_until_end = k + 1
								found_verb_after_current_breakrule = False
								while index_to_check_for_verb_until_end < j and leaves[index_to_check_for_verb_until_end].split("/")[1] != "HH":
									if leaves[index_to_check_for_verb_until_end].split("/")[1] in ["VB","VBP","VBN","VBG","VBZ","VBD","AUX","AUXG"]:
										found_verb_after_current_breakrule = True
									index_to_check_for_verb_until_end = index_to_check_for_verb_until_end + 1
								if parenthetical_already_tagged_as_clause:
									pass
								elif found_verb_since_last and found_verb_after_current_breakrule and leaves[index_to_check_for_verb_until_end].split("/")[0] not in ["BREAKrule4a", "BREAKrule4b"]:
									leaves[k] = "break_within_paren/HH"
								else:
									leaves = leaves[:k] + leaves[k+1:]
									j = j - 1
									k = k - 1
							k = k + 1
						if found_verb_since_last and not parenthetical_already_tagged_as_clause:
							if leaves[j].split("/")[1] == ".":
								leaves.insert(j, "surfaceRule4b/HH")
								if leaves[first_dash_location-1] in break_rule_list:
									leaves = leaves[:first_dash_location-1] + leaves[first_dash_location:]
									first_dash_location = first_dash_location - 1
									j = j - 1
							else:
								leaves.insert(j+1, "surfaceRule4b/HH")
						i = j
						break
				if found_verb_since_last and not parenthetical_already_tagged_as_clause:
					leaves.insert(first_dash_location, "surfaceRule4a/HH")
					i = i + 1
			i = i + 1
	return leaves


if __name__ == '__main__':
	# usage: <this script> <file with parse trees separated by blank lines> <output file>
	# fout2, fout3, f2name and VPoutfile are also read as globals by functions in this file
	finput = open(sys.argv[1], "r")
	f2name = sys.argv[2] + ".temp"
	f3name = sys.argv[2] + ".temp_with_all_breakRules"
	fout = open(f2name, "w") # file with breakrules in text
	fout2 = open(sys.argv[2], "w") # file with the segments only
	fout3 = open(f3name, "w") # file with ALL breakrules in text, even consecutive ones
	VPoutfile = open("coordVPsentences.txt", "a")
	attrVBfile = open("attrVPsentences.txt", "a")
	
	# legend of the break rules, appended to the file that contains the rule names
	guide = "BREAK RULES\
	1. if preceding element is S or S-bar and there is a VP anywhere after comma/connective, then break at comma or connective \n\
	2. not used\n\
	3. if S/S-Bar is a child of VP (and the Sbar does not have a restricted verb that is under an immediate child that is a VBP), then break before any S/S-bar under VP\n\
	   if VP has no VB, do not break\n\
	   if 1st element of SBAR/S is an IN (and is not a \"that\"), then break regardless of no VB in VP\
	   we never break on \"that\"\
	4. if node is S/S-bar with commas on either side, remove the node\n\
	   if element before is an adverb or CC, don't break at first comma but do at second\n\
	   but do break at both commas/quotes if 1st element before comma is NP (but not relative clause), move it to after the second comma surrounded by parentheses\n\
	5. if node is S/S-bar with commas or quotes or parentheses on either side, remove the node\n\
	   but do break at both commas/quotes if 1st element before comma is NP (but not relative clause), move it to after the second comma surrounded by parentheses\n\
	6. break between VBN and S\n\
	7. not used\n\
	8. not used\n\
	9. not used\n\
	10. break when S/SBAR is a child of PP\n\
	11. not used\n\
	12. not used\n\
	13. break when NP follows SBAR and ,\n\
	14. break when CC is directly between two VPs, and the CC is NOT \"and\" and a comma before the CC\n \
	15. break if a CC or comma has an S before it, and S, SBAR, or VP after it \n \
	16. break on colons/semicolons\n \
	17. break on SBAR following comma\n"
	
	try:
		list_of_sentences = finput.read().split("\n\n")
		
		fout.write("<T>\n<P>\n")
		fout2.write("<T>\n<P>\n")
		
		# the element after the last "\n\n" is not processed
		for i in range(len(list_of_sentences)-1):
			attr_in_sentence[0] = False
			# NOTE: str.strip treats its argument as a SET of characters, not as a prefix:
			# every leading/trailing character that occurs in the parser's "skipped"
			# message is removed. "(" and ")" are not in that set, so stripping stops
			# at the brackets of a parse tree.
			list_of_sentences[i] = list_of_sentences[i].strip("Sentence skipped: no PCFG fallback.\nSENTENCE_SKIPPED_OR_UNPARSABLE\n")
			tree_of_a_sentence = nltk.bracket_parse(list_of_sentences[i])
			traverse_nodes(tree_of_a_sentence)
			#tree_of_a_sentence.draw() #used for debugging via nltk gui
			if attr_in_sentence[0]:
				# log the sentence (and the name of the output file) for later inspection
				temp = "attr:\n"
				for leaf in tree_of_a_sentence.leaves():
					temp = temp + " " + leaf
				temp = temp + "\n"
				attrVBfile.write(temp)
				attrVBfile.write(f2name)
				attrVBfile.write("\n\n\n")
				attr_in_sentence[0] = False
			write_segments_to_files(tree_of_a_sentence, fout, i)
		
		fout.write("</P>\n</T>\n")
		fout2.write("</P>\n</T>\n")
		fout.write(guide)
	finally:
		# close every file, including the two log files that were never closed before,
		# even if a sentence fails to parse
		finput.close()
		fout.close()
		fout2.close()
		fout3.close()
		VPoutfile.close()
		attrVBfile.close()