

#Pattern Derivatives v 0.1
#Author and Copyright Casey S. Schroeder
#
#Released under the MIT License
#
#Compendium of initial efforts to code fomulas relevant to solving the problem of efficiently counting
#embedded sequences and as further relates to the matter of pricing Pattern Derivatives
#
#Mindful of the differences here.  Compared with the paper a = L, n = k, i = t, and m = n.  This is similar to some
#vocabulary on pattern matching in the computer science literature that i used when initially investigating the problem.
#(simultaneously, note that n has a different meaning) 


import math


#********************************************************************************************************
#The following is the exact basic recursive probability formula used in the basic pattern derivative case,
#and it's summation, as used in the European Options case.
# 
#Mindful of the differences here.  Compared with the paper a = L, n = k, i = t, and m = n.  This is similar to some
#vocabulary on pattern matching in the computer science literature that i used when initially investigating the problem.
#epsilon here just provides a set of indicies
#********************************************************************************************************

def epsilon_p(p, q):
	'''finds the self-overlap index (1-based) set for the given pattern (string); needs modification with 
	decorators so that a hash of it is made and is not recomputed each time'''
	n = len(p)
	l2 = len(q)	
	if n != l2: raise ValueError("length of patterns in set do not match")
	ind = []
	for i in range(0,n-1):
		flag = True
		for j in range(0, i+1):
			if q[j]!=p[n-i-1+j]:
				flag = False
				break
#			print p[j],p[n-i-1+j],flag
		if flag == True:
			ind.append(i+1)
#			print i
	return ind


def epsilon(p, q, wc='*'):
	'''finds the self-overlap index (1-based) set for the given pattern (string); needs modification with 
	decorators so that a hash of it is made and is not recomputed each time.  
	does not assume p and q have same length, but will not match at any index greater than length min(p, q)
	i.e. will not provide an index if p is in q or q in p, unless q is at end of p or p at beginning of q
	it is used where it is assumed that one pattern is not embedded in another, except if p is a prefix to an mseries
	'''
	n = len(p)
	l2 = len(q)
	ind = []
	for i in range(0,n):
		if i>=l2-1: break
		flag = True
		for j in range(0, i+1):
			if q[j]!=p[n-i-1+j] and wc!=p[n-i-1+j] and q[j]!=wc:
				flag = False
				break
		if flag == True:
			ind.append(i+1)
	return ind

def epsilon_m(p, q):
	'''finds the self-overlap index (1-based) set for the given pattern (string); needs modification with 
	decorators so that a hash of it is made and is not recomputed each time.  
	does not assume p and q have same length, but will not match at any index greater than length min(p, q)
	i.e. will not provide an index if p is in q or q in p, unless q is at end of p or p at beginning of q
	it is used where it is assumed that one pattern is not embedded in another, except if p is a prefix to an mseries
	'''
	n = len(p)
	l2 = len(q)
	ind = []
	for i in range(0,n):
		if i>=l2-1: break
		flag = True
		for j in range(0, i+1):
			if q[j]!=p[n-i-1+j]:
				flag = False
				break
		if flag == True:
			ind.append(i+1)
	return ind


def pr_first_occ_at_index(n, i, a, epsilon):
	'''basic recursive function indicated in the text, used subsequently in all valuation formulas
	accurately shows that the exponent in the last term should be n-i+j and not n-i-j.  Mindful of 
	the differences here.  Compared with the paper a = L, n = k, i = t, and epsilon is the e function, but 
	here it is just a list'''

	if i < n: return 0

	nooversum = 0
	if i >= 2*n:
		for j in range(n, i-n+1):
			nooversum -= pr_first_occ_at_index(n, j, a, epsilon)

	oversum = 0
	if n > 1:
		for j in range(i-n+1, i-1+1):
			if (n-i+j) in epsilon:
				oversum -= pr_first_occ_at_index(n, j, a, epsilon)*pow(a,n-i+j)
		'''slightly improved version of oversum calc 
			if n > 1:
				for e in epsilon:
					j = (e + 1) - n + i
					oversum -= pr_first_occ_at_index(n, j, a, epsilon)*pow(a,n-i+j)'''

	return (1+nooversum+oversum)*pow(a,-n)


def pr_sum_recursive(m, p, a):
	'''uses the exact method indicated in the paper for the base case of one-pattern European Options
	called recursive here, because it uses the basic recursive function indicated in the text'''
	total = 0
	n = len(p)
	eps = epsilon(p, p)
	for i in range(n,m+1):
		total += pr_first_occ_at_index(n,i,a, eps)
	return total,eps

#********************************************************************************************************
# fundamental recursive function indicated in the text for sets of patterns, indicating probability that any pattern
# from the set occurs at a given time.  It is useful for constructing the disjunctive (V) multi-pattern derivative.  
# 
#Mindful of the differences here.  Compared with the paper a = L, n = k, i = t, and m = n.  This is similar to some
#vocabulary on pattern matching in the computer science literature that i used when initially investigating the problem.
#Epsilon is here a function returning indicies.
# 
#********************************************************************************************************


def pr_first_occ_at_index_nset(p, i, nset, a):
	'''basic recursive function indicated in the text for sets of patterns.  It is useful for constructing the
	disjunctive (V) multi-pattern derivative.  Again, compared with the paper a = L, n = k, i = t, and epsilon 
	is the e function, but here it is just a list'''
	n = len(p)

	if i < n: return 0

	nooversum = 0
	if i >= 2*n:
		for q in nset:
			for j in range(n, i-n+1):
				nooversum -= pr_first_occ_at_index_nset(q, j, nset, a)

	oversum = 0
	'''see single pattern version for possible optimization to the following'''
	if n > 1:
		for q in nset:
			for j in range(i-n+1, i-1+1):
				if (n-i+j) in epsilon(q,p):
					oversum -= pr_first_occ_at_index_nset(q, j, nset, a)*pow(a,n-i+j)

	return (1+nooversum+oversum)*pow(a,-n)


def pr_sum_recursive_nset(m, nset, a):
	'''uses the exact method indicated in the paper for the base case of n-pattern European Options
	called recursive here, because it uses the basic recursive function indicated in the text'''
	total = 0
	n = len(nset[0])
	for p in nset:
		for i in range(n,m+1):
			total += pr_first_occ_at_index_nset(p,i,nset,a)
	return total

#********************************************************************************************************
# *Market Tradable Case*
# recursive function indicated in the text for sets of patterns in the *market tradable case*, 
# indicating probability that a pattern from the set occurs at a given time, including times prior to time m = n,
# given a prior set of events.

#Mindful of the differences here.  Compared with the paper a = L, n = k, i = t, and m = n.  This is similar to some
#vocabulary on pattern matching in the computer science literature that i used when initially investigating the problem.
#Epsilon is here a function returning indicies.
# 
#********************************************************************************************************

def pr_first_occ_at_index_nset_leading_values(p, i, nset, a, w):
	'''same as pr_first_occ_at_index_nset but uses w as the prefix, indicating an initial series of values which
	have already occurred. '''
	n = len(p)
	leadingsum = 0
	nooversum = 0
	oversum = 0
	if i <= 0 or n == 0: 
		return 0
	elif i <= n:
		if n-i in epsilon(w,p) or i==n:
			leadingsum = pow(a,-i)
			for q in nset:
				for j in epsilon(q,p):
					#if the following condition holds, then the existance of p at i *forces* the existance of a prior q
					#in which case this cannot be the first occurrance from nset
					if len(q)+n - (i + j) in epsilon(w, q): leadingsum = 0
					#if j <= i-1: 
					#leadingsum -= pr_first_occ_at_index_nset_leading_values(q, i-(n-j), nset, a, w)*pow(a,-i+(n-j))
		return leadingsum
	#elif i < 2*n:
	else: 
		if i > n:
			for q in nset:
				for j in range(1, i-n+1):
					nooversum -= pr_first_occ_at_index_nset_leading_values(q, j, nset, a, w)
	
		'''see single pattern version for possible optimization to the following'''
		if n > 1:
			for q in nset:
				for j in range(i-n+1, i-1+1):
					if (n-i+j) in epsilon(q,p):
						oversum -= pr_first_occ_at_index_nset_leading_values(q, j, nset, a, w)*pow(a,n-i+j)

		return (1+nooversum+oversum)*pow(a,-n)

	return 0

val_dict = dict()

def pr_first_occ_at_index_nset_leading_values_wc(p, i, nset, a, w, wc='*'):
	'''same as pr_first_occ_at_index_nset but uses w as the prefix, indicating an initial series of values which
	have already occurred. '''
	vals = 	(p, i, tuple(nset), a, w, wc)
	if val_dict.has_key(vals):
		return val_dict[vals]
	n = len(p)
	leadingsum = 0
	nooversum = 0
	oversum = 0
	if i <= 0 or n == 0: 
		val_dict[vals] = 0
		return 0
	elif i <= n:
		if n-i in epsilon(w,p) or i==n:
			leadingsum = 1
			for q in nset:
				for j in epsilon(q,p):
					#if the following condition holds, then the existance of p at i *forces* the existance of a prior q
					#in which case this cannot be the first occurrance from nset
					if len(q)+n - (i + j) in epsilon(w, q): return 0
					#if j <= i-1: 
					#leadingsum -= pr_first_occ_at_index_nset_leading_values(q, i-(n-j), nset, a, w)*pow(a,-i+(n-j))
		val = pow(a, -i+p[n-i:n].count(wc))*leadingsum
		val_dict[vals] = val
		return val
	#elif i < 2*n:
	else: 
		if i > n:
			for q in nset:
				for j in range(1, i-n+1):
					nooversum -= pr_first_occ_at_index_nset_leading_values_wc(q, j, nset, a, w, wc)

		'''see single pattern version for possible optimization to the following'''
		if n > 1:
			for q in nset:
				for j in range(i-n+1, i-1+1):
					if (n-i+j) in epsilon(q,p):
						#needs testing for a != 2 and modification in the event of multiple wildcards... in all *much* work but appears to work for one wc in one string
						oversum -= pr_first_occ_at_index_nset_leading_values_wc(q, j, nset, a, w, wc)*pow(a,n-i+j - q[len(q)-(n-i+j):len(q)].count(wc) - p[0:len(p)-(n-i+j)].count(wc))
		val = (1+nooversum+oversum)*pow(a,-(n- p.count(wc)))
		val_dict[vals] = val
		return val
	return 0


def pr_sum_recursive_nset_with_leading_values(m, nset, a, w):
	'''m is length of series in which a member of nset is to be found.  nset is the set of patterns, all of the same
	size.  a is the size of the alphabet.  w is a prefix to m, which must be the same size as the patterns in nset.
	for instance, if we are dealing with a derivative with 20 periods to expiry, with a contract period of 25+ days
	and patterns of length 5 (and it has yet to be struck) then insert m = 20, the set of patterns = nset, the size 
	of the alphabet (a = 2 for {'u','d'}) and w equal to a string of the five previous values, from t = 2 to t=6, e.g.
	"udduud".  If you are at a t < 5, say t=3, you can use "xxxud", where 'x' must not occur in the alphabet.
	'''
	total = 0
	for p in nset:
		for i in range(1,m+1):
			total += pr_first_occ_at_index_nset_leading_values_wc(p,i,nset,a, w)
	return total


def pr_sum_recursive_nset_with_leading_values_american_mixed(m, q_i, q_e, a, w):
	'''
	used in valuation of mixed american options.  Same as standard, but sum only over q_i among q_i U q_e
	'''
	total = 0
	union = list(q_i)
	union.extend(q_e)
	for p in q_i:
		for i in range(1,m+1):
			total += pr_first_occ_at_index_nset_leading_values_wc(p,i,union,a, w)
	return total


def pr_sum_recursive_nset_with_leading_values_euro_mixed(m, q_i, q_e, a, w):
	'''
	used in valuation of mixed european options.  Same as standard, but sum only over q_i among q_i U q_e
	'''
	total = 0
	union = list(q_i)
	union.extend(q_e)
	for p in q_i:
		summa = 0
		for i in range(1,m+1):
			val = pr_first_occ_at_index_nset_leading_values_wc(p,i,union,a, w)
			subval = 0
			for q in q_e:
				for j in range(1, m-i+1):
					subval += pr_first_occ_at_index_nset_leading_values_wc(q,j,q_e,a, p)
			summa += (val * (1 - subval))
		total += summa
	return total

#********************************************************************************************************
# *Valuation formulas*
# The following formulas are for use in the standard form of the non-mixed pattern option case;
# as it is non-mixed, these formulas only take one pattern clause, nset (Q, in the paper).
# mixed versions are easily provided by adding the payment component of these formulas to the logic
# in the mixed probability formuals provided in the previous section (i.e. functions ending with "_mixed")
#********************************************************************************************************

def full_inclusive_american(m, nset, a, w, payoff, rate, period=365):	
	import math
	total = 0
	totprob = 0
	for p in nset:
		for i in range(1,m+1):
			prob = pr_first_occ_at_index_nset_leading_values_wc(p,i,nset,a, w)
			total += prob*math.exp(-rate*i/period)
	return total*payoff

def full_inclusive_european(m, nset, a, w, payoff, rate, period=365):
	import math
	total = 0
	for p in nset:
		for i in range(1,m+1):
			total += pr_first_occ_at_index_nset_leading_values_wc(p,i,nset,a, w)
	return total*payoff*math.exp(-rate*m/period)

def full_exclusive_european(m, nset, a, w, payoff, rate, period=365):
	import math
	total = 0
	for p in nset:
		for i in range(1,m+1):
			total += pr_first_occ_at_index_nset_leading_values_wc(p,i,nset,a, w)
	return (1-total)*payoff*math.exp(-rate*m/period)

net_dict = dict()
def iterative_inclusive_american_no_prefix_no_wait(m, nset, a, payoff, rate, period=365):
	#in this version, if there is a match, then there is a payout equal to "payoff" plus a new derivative with the same
	#specifications, which starts immediately following the match and runs to the end of the term, but does not include
	#the matching pattern as a prefix.
	import math
	if m<1 : return 0
	vals = 	(m, tuple(nset), a, payoff, rate, period)
	if net_dict.has_key(vals): return net_dict[vals]
	total = 0
	totprob = 0
	for p in nset:
		for i in range(1,m+1):
			adj_payoff = payoff + iterative_inclusive_american(m-i-len(p), nset, a, payoff, rate, period)
			prob = pr_first_occ_at_index_nset_leading_values_wc(p,i,nset,a, '')
			total += adj_payoff*prob*math.exp(-rate*i/period)
	return total

net_dict2 = dict()
def iterative_inclusive_american_prefix_no_wait(m, nset, a, w, payoff, rate, period=365):
	#in this version, if there is a match, then there is a payout equal to "payoff" plus a new derivative with the same
	#specifications, which starts immediately following the match and runs to the end of the term, and includes the 
	#matching pattern as a prefix.
	import math
	if m<1 : return 0
	vals = 	(m, tuple(nset), a, payoff, rate, period)
	if net_dict2.has_key(vals): return net_dict2[vals]
	total = 0
	totprob = 0
	for p in nset:
		for i in range(1,m+1):
			adj_payoff = payoff + iterative_inclusive_american(m-i-len(p), nset, a, p, payoff, rate, period)
			prob = pr_first_occ_at_index_nset_leading_values_wc(p,i,nset,a, w)
			total += adj_payoff*prob*math.exp(-rate*i/period)
	return total

net_dict3 = dict()
def iterative_inclusive_american_no_prefix_wait(m, nset, a, payoff, rate, wait, period=365):
	#in this version, if there is a match, then there is a payout equal to "payoff" plus a new derivative with the same
	#specifications, which starts some period "wait" following the match and runs to the end of the term, and does not 
	#include the matching pattern as a prefix.
	import math
	if m<1 : return 0
	vals = 	(m, tuple(nset), a, payoff, rate, period)
	if net_dict3.has_key(vals): return net_dict3[vals]
	total = 0
	totprob = 0
	for p in nset:
		for i in range(1,m+1):
			adj_payoff = payoff + iterative_inclusive_american(m-i-len(p)-wait, nset, a, payoff, rate, wait, period)
			prob = pr_first_occ_at_index_nset_leading_values_wc(p,i,nset,a, w)
			total += adj_payoff*prob*math.exp(-rate*i/period)
	return total

m=24
alpha = 2
arr = list()
vals = list()
baseline=pr_sum_recursive(m,"01111111",alpha)
baseline=baseline[0]
vals.append(pr_sum_recursive(m,"11111111111111",alpha))
vals.append(pr_sum_recursive(m,"10000001000000",alpha))
vals.append(pr_sum_recursive(m,"00000011000000",alpha))
#vals.append(pr_sum_recursive(m,"101101",alpha))
#vals.append(pr_sum_recursive(m,"011101",alpha))
#vals.append(pr_sum_recursive(m,"001100",alpha))
#vals.append(pr_sum_recursive(m,"011110",alpha))
arr.append(((vals[0][0]-baseline)*(pow(2,24))))
arr.append(((vals[1][0]-baseline)*(pow(2,24))))
arr.append(((vals[2][0]-baseline)*(pow(2,24))))
#arr.append(((vals[3][0]-baseline))/sum(vals[3][1]))
#arr.append(((vals[4][0]-baseline))/sum(vals[4][1]))
#arr.append(((vals[5][0]-baseline))/sum(vals[5][1]))


#the case of a wait and a prefix together is slightly more difficult.  You must make the loop for time variable.

'''
#validation script.  Runs brute function, assumed accurate but slow, against the recursive function for 
#probability.  Probability embedded in the brute force function return value text, easy to see, but modify return
#values for automated checking.
from BrutePatterns import *
#print pr_sum_recursive_nset_with_leading_values(30, ["aaaaa"], 2, "xxxxx")
#print brute_nset_m_series_with_prefix_count_greaterthan_or_exactly_c_copies(10, ['a','b'], ["aba"], 1, "bba")
tot = [["x", "d"],["xx", "du"],["xxx", "duu"],["xxxx", "duuu"],["xxxxx", "duuuu", ],["xxxxxx", "duuuuu"]]
#f = file("C:\Users\Do It\Desktop\print_out.txt",'w')
#for s in tot:
#	for x in s[1:]:
#		for i in range(1,16):
#			f.writelines(x+": "+ str(i) + ": ")
			#f.writelines(str(pr_sum_recursive_nset_with_leading_values(i, [x], 2, s[0]))+": ")
#			f.writelines(str(brute_nset_m_series_with_prefix_count_greaterthan_or_exactly_c_copies_onecounts(i, ['u','d'], [x], 1, s[0]))+'\n')
#f.close()
#... and so on.  This validates the most general case. 

def patterns(n, k, a):
	if k > n or n < 1 or k < 1: return 0.0
	if k == 1: return 1.0-(pow(a-1.0, n)/pow(a,n))
	if k == n: return pow((1.0/a),n)
	l = patterns(n-1, k, a) # chance that the k pattern occurs in the prior n-1
	v = patterns(n-1, k-1, a) # chance that the k-1 preceding pattern occurs in the prior n-1
	c = patterns(n-2, k-1, a) # chance that the k-1 pattern occurs in the prior n-2
	print l, v, c
	#chance in prior n-1, + chance that did not occur in prior n-1 * chance the preceding k-1 first occurs at n-1
	# * the probability of getting the correct letter
	return l + (1-l)*(v*(1 - c))*(1.0/a)

'''