import time import random from random import shuffle import sys from collections import deque from math import sqrt import math #import numpy as np sys.setrecursionlimit(100000) class Graph(object): def __init__(self): self.nodes = [] self.matrix= {} self.listTest=[] self.D_EdgeFrequency={} def sigmoid(self,x): return 1 / (1 + math.exp(-x)) def logit(self,p): return math.log(p) - math.log(1 - p) def preprocessingSoloPositivi(self, edgeList, query): for startNode,destNode in edgeList: D_e_frequency = {} self.provePreprocessing(startNode, destNode, max([len(x) for x in query]), query, D_e_frequency) for elem in D_e_frequency: if (elem in self.D_EdgeFrequency): self.D_EdgeFrequency[elem] = self.D_EdgeFrequency[elem]+1 else: self.D_EdgeFrequency[elem]=1 def load(self, path): self.matrix = {} for line in open(path, 'r'): data = line.strip('\r\n').split(' ') if data[0]=='e': self.matrix[int(data[1])].setdefault(data[4],[]) self.matrix[int(data[1])][data[4]].append((int(data[2]),float(data[3]))) elif data[0]=='v': self.nodes[int(data[1])]= data[2] self.matrix.setdefault(int(data[1]),{}) else: self.cols = int(data[0]) self.nodes = [] for i in range(self.cols): self.nodes.append("") #print ' done!' def fromExamplesToList(self, nomeFile,classNodeList): edgesTestList={} for line in open(nomeFile, 'r'): data = line.strip('\r\n').split(' ') edgesTestList[(int(data[1]),int(data[2]))]=classNodeList.index(int(data[2])) return edgesTestList def fromExamplesToList2(self, nomeFile,classNodeList): edgesTestList={} for line in open(nomeFile, 'r'): print line data = line.strip('\r\n').split(' ') edgesTestList[(int(data[1]),int(data[2]))]=classNodeList.index(data[4]) return edgesTestList def loadTest(self, pathT): print "Loading test from file %s..." % pathT, for lineT in open(pathT, 'r'): dataT = lineT.strip('\r\n').split(' ') if dataT[0]=='e': self.listTest.append([int(dataT[1]),int(dataT[2]),dataT[4]]) print ' done!' def exact(self, startNode, endNode, query): return self._exact(startNode, startNode, endNode, [], [], [], len(query), len(query), query) def _exact(self, start, startNode, endNode, visited, Negated, Positive, startDepth, depth, query): if (depth == 0 and startNode == endNode): return 1.0 if depth == 0: return 0.0 visitedNew = visited + [startNode] for Node in self.matrix[startNode]: if (Node[2] == query[len(query)-depth] and \ not (startNode, Node[0]) in Negated and \ not Node[0] in visited): nextNode = Node if nextNode: target = nextNode[0] ProbEdge = nextNode[1] if not (startNode, target) in Positive: PositiveNew = Positive + [(startNode,target)] PositiveNew.append((target,startNode)) else: PositiveNew = Positive probLeft = self._exact(start, target, endNode, visitedNew, Negated, PositiveNew, startDepth, depth-1,query) if probLeft != 0.0: if not (startNode, target) in Positive: NegatedNew = Negated + [(startNode,target)] NegatedNew.append((target,startNode)) probRight = self._exact(start, start, endNode, [], NegatedNew, Positive, startDepth, startDepth, query) return (ProbEdge*probLeft + (1.0-ProbEdge)*probRight) else: return probLeft return 0.0 def prove(self, startNode, endNode, visited, sampler, k): if k == 0: return True visited.append(startNode) self._visitedNodes = self._visitedNodes + 1 for [target,probEdge, EdgeLabel] in self.matrix[startNode]: if not target in visited: self.prove(target, endNode, visited, sampler, k-1) visited.remove(startNode) return True def sampled_as_true(self,Sampler, prevNode, target, probEdge): Sampled = Sampler.get((prevNode, target), -1) if Sampled != -1: if Sampled==1: return True else: P = random.random() if P <= probEdge: Sampler[(prevNode, target)] = 1 return True else: Sampler[(prevNode, target)] = 0 return False def proveMCW(self, startNode, endNode, maxDepth, query): if maxDepth == 1: return True depth = 0 visited = [0] * maxDepth prevLabels = [""] * maxDepth visited[0] = startNode Stack = deque() Sampler = dict() EdgeLabels = set([q[depth] for q in query if len(q)>depth]) L = self.matrix[startNode] Stack.extend([(target,probEdge, EdgeLabel, depth+1) for EdgeLabel in EdgeLabels for (target,probEdge) in L.get(EdgeLabel, []) ]) while Stack: (target,probEdge, EdgeLabel, depth) = Stack.pop() prevLabels[depth-1] = EdgeLabel prevNode = visited[depth-1] #if not target in visited[:depth]: if True: #allow cycle if self.sigmoid(probEdge) < 1.0: Sampled = self.sampled_as_true(Sampler, prevNode, target, self.sigmoid(probEdge)) else: Sampled = True if Sampled: if target == endNode and (prevLabels[:depth] in query): return True if depth != maxDepth: visited[depth] = target EdgeLabels1 = set([q1[depth] for q1 in query if len(q1)>depth and q1[:depth] == prevLabels[:depth]]) L1 = self.matrix[target] Stack.extend([(target1,probEdge1, EdgeLabel1,depth+1) for EdgeLabel1 in EdgeLabels1 for (target1,probEdge1) in L1.get(EdgeLabel1, []) ]) return False def proveMC(self, startNode, endNode, maxDepth, query): if maxDepth == 1: return True depth = 0 visited = [0] * maxDepth prevLabels = [""] * maxDepth visited[0] = startNode Stack = deque() Sampler = dict() EdgeLabels = set([q[depth] for q in query if len(q)>depth]) L = self.matrix[startNode] Stack.extend([(target,probEdge, EdgeLabel, depth+1) for EdgeLabel in EdgeLabels for (target,probEdge) in L.get(EdgeLabel, []) ]) while Stack: (target,probEdge, EdgeLabel, depth) = Stack.pop() prevLabels[depth-1] = EdgeLabel prevNode = visited[depth-1] if not target in visited[:depth]: if probEdge < 1.0: Sampled = self.sampled_as_true(Sampler, prevNode, target, probEdge) else: Sampled = True if Sampled: if target == endNode and (prevLabels[:depth] in query): return True if depth != maxDepth: visited[depth] = target EdgeLabels1 = set([q1[depth] for q1 in query if len(q1)>depth and q1[:depth] == prevLabels[:depth]]) L1 = self.matrix[target] Stack.extend([(target1,probEdge1, EdgeLabel1,depth+1) for EdgeLabel1 in EdgeLabels1 for (target1,probEdge1) in L1.get(EdgeLabel1, []) ]) return False def infer(self, startNode, endNode, k, query): count = 0.0 for i in range(k): if self.proveMC(startNode, endNode, max([len(x) for x in query]), query): count = count + 1.0 Prob = count / k return Prob def inferW(self, startNode, endNode, k, query): count = 0.0 for i in range(k): if self.proveMCW(startNode, endNode, max([len(x) for x in query]), query): count = count + 1.0 Prob = count / k return Prob def stampa2(self,wF,pF): fileW = open(wF, 'w') file1 = open(pF, 'w') fileW.write(str(len(self.matrix))+'\n') file1.write(str(len(self.matrix))+'\n') for i in range(len(self.matrix)): fileW.write('v %d node\n' % i) file1.write('v %d node\n' % i) L={} for x in self.matrix: for y in self.matrix[x]: for z in self.matrix[x][y]: if (x,z[0]) not in L and (z[0],x) not in L: L[(x,z[0])]=(z[1],y) for sElem,dElem in L: fileW.write('e %s %s %s %s\n' % (sElem,dElem,L[sElem,dElem][0],L[sElem,dElem][1])) file1.write('e %s %s %s %s\n' % (sElem,dElem,self.sigmoid(L[sElem,dElem][0]),L[sElem,dElem][1])) fileW.close() file1.close() def stampa(self,sourceFile,wF,pF): fileW = open(wF, 'w') file1 = open(pF, 'w') L={} for x in self.matrix: for y in self.matrix[x]: for z in self.matrix[x][y]: L[(x,z[0],y)]=z[1] for line in open(sourceFile,'r'): data = line.strip('\r\n').split(' ') if data[0]=='v': fileW.write(line) file1.write(line) elif data[0]=='e': fileW.write('e %s %s %s %s\n' % (data[1],data[2],L[int(data[1]),int(data[2]),data[4]],data[4])) file1.write('e %s %s %s %s\n' % (data[1],data[2],self.sigmoid(L[int(data[1]),int(data[2]),data[4]]),data[4])) else: fileW.write(line) file1.write(line) fileW.close() file1.close() def sgd(self, startNode, endNode, k, q,tC,lrate,noUpdateList): Ledges=[] Df={} for rat in range(len(q)): Df[rat]={} for rat in range(len(q)): count = 0.0 query=[q[rat]] for i in range(k): if self.proveSGD(startNode, endNode, max([len(x) for x in query]), query,Df,rat): count = count + 1.0 Prob = count / k for edge in Df[rat]: Df[rat][edge]=(Prob,(Df[rat][edge]/(k*1.0))-((count-Df[rat][edge])/(k*1.0))) Ledges.append(edge) for sUp,dUp,label in set(Ledges): if label not in noUpdateList: val=0 for rat in range(len(q)): if (sUp,dUp,label) in Df[rat]: if (rat==(tC-1)): val=val+(Df[rat][(sUp,dUp,label)][0]-1)*Df[rat][(sUp,dUp,label)][1] else: val=val+Df[rat][(sUp,dUp,label)][0]*Df[rat][(sUp,dUp,label)][1] for x in range(len(self.matrix[sUp][label])): if (self.matrix[sUp][label][x][0]==dUp): if -10 < self.matrix[sUp][label][x][1]-(lrate*val) < 10 : self.matrix[sUp][label][x]=(self.matrix[sUp][label][x][0],self.matrix[sUp][label][x][1]-(lrate*val)) for x in range(len(self.matrix[dUp][label])): if (self.matrix[dUp][label][x][0]==sUp): if -10 < self.matrix[dUp][label][x][1]-(lrate*val) < 10 : self.matrix[dUp][label][x]=(self.matrix[dUp][label][x][0],self.matrix[dUp][label][x][1]-(lrate*val)) def sgd_nodeClass(self, startNode, endNode, k, queryList,tC,lrate,cicla,T,noUpdateList): Ledges=[] Df={} for rat in range(len(queryList)): Df[rat]={} for rat in range(len(queryList)): count = 0.0 for i in range(k): if self.proveSGD(startNode, endNode, max([len(x) for x in queryList[rat]]), queryList[rat],Df,rat): count = count + 1.0 Prob = count / (k+2) print '{0:.4f}'.format(Prob), for (sourceNodeS, destNodeS, labelEdgeS, probEdgeS) in Df[rat].keys(): if (Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][1]==0): del Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)] else: if (probEdgeS != 1.0): k1 = k + 2 unsampled = count - (Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][0] + Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][1]) Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)] = (Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][0] + unsampled * (1-probEdgeS), Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][1]) Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)] = (Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][0], Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][1] + unsampled * probEdgeS) pe = Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][1] / (k+2) Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)] = \ (Prob,(pe*(1-pe) * Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][1] /(k1*1.0)/pe )-(pe*(1-pe)*Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][0]/(k1*1.0)/(1-pe))) else: Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)] = (Prob, 999999999999999999) Ledges.append((sourceNodeS, destNodeS, labelEdgeS, probEdgeS)) print '' for sUp,dUp,label,prob in set(Ledges): if label not in noUpdateList: val = 0 for rat in range(len(queryList)): if (sUp,dUp,label,prob) in Df[rat]: if (rat==tC): val=val+(Df[rat][(sUp,dUp,label,prob)][0]-1)*((Df[rat][(sUp,dUp,label,prob)][1])) else: val=val+((Df[rat][(sUp,dUp,label,prob)][0])*((Df[rat][(sUp,dUp,label,prob)][1]))) for x in range(len(self.matrix[sUp][label])): if (self.matrix[sUp][label][x][0]==dUp): if not (self.matrix[sUp][label][x][1] > 10 or self.matrix[sUp][label][x][1] < -10) : anneling=1+(cicla/T) if (sUp,dUp,label) in self.D_EdgeFrequency: lrateNew=(lrate/((1.0+(self.D_EdgeFrequency[(sUp,dUp,label)]*1.0))/(1.0)))/anneling else: lrateNew=(lrate/(1.0/(1.0)))/anneling self.matrix[sUp][label][x]=(self.matrix[sUp][label][x][0],self.matrix[sUp][label][x][1]-(lrate*val)) if (self.matrix[sUp][label][x][1]>10): self.matrix[sUp][label][x] = (self.matrix[sUp][label][x][0],10) if (self.matrix[sUp][label][x][1]<-10): self.matrix[sUp][label][x] = (self.matrix[sUp][label][x][0],-10) for x in range(len(self.matrix[dUp][label])): if (self.matrix[dUp][label][x][0]==sUp): if not (self.matrix[dUp][label][x][1] > 10 or self.matrix[dUp][label][x][1] < -10) : self.matrix[dUp][label][x]=(self.matrix[dUp][label][x][0],self.matrix[dUp][label][x][1]-(lrate*val)) if (self.matrix[dUp][label][x][1]>10): self.matrix[dUp][label][x] = (self.matrix[dUp][label][x][0],10) if (self.matrix[dUp][label][x][1]<-10): self.matrix[dUp][label][x] = (self.matrix[dUp][label][x][0],-10) def sgd_nodeClass__old(self, startNode, endNodeList, k, query,tC,lrate,cicla,T,noUpdateList): Ledges=[] Df={} for rat in range(len(endNodeList)): Df[rat]={} for rat in range(len(endNodeList)): count = 0.0 for i in range(k): if self.proveSGD(startNode, endNodeList[rat], max([len(x) for x in query]), query,Df,rat): count = count + 1.0 Prob = count / (k+2) print '{0:.4f}'.format(Prob), for (sourceNodeS, destNodeS, labelEdgeS, probEdgeS) in Df[rat].keys(): if (Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][1]==0): del Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)] else: if (probEdgeS != 1.0): k1 = k + 2 unsampled = count - (Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][0] + Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][1]) Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)] = (Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][0] + unsampled * (1-probEdgeS), Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][1]) Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)] = (Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][0], Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][1] + unsampled * probEdgeS) pe = Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][1] / (k+2) Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)] = \ (Prob,(pe*(1-pe) * Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][1] /(k1*1.0)/pe )-(pe*(1-pe)*Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)][0]/(k1*1.0)/(1-pe))) else: Df[rat][(sourceNodeS, destNodeS, labelEdgeS, probEdgeS)] = (Prob, 999999999999999999) Ledges.append((sourceNodeS, destNodeS, labelEdgeS, probEdgeS)) print '' for sUp,dUp,label,prob in set(Ledges): if label not in noUpdateList: val = 0 for rat in range(len(endNodeList)): if (sUp,dUp,label,prob) in Df[rat]: if (rat==tC): val=val+(Df[rat][(sUp,dUp,label,prob)][0]-1)*((Df[rat][(sUp,dUp,label,prob)][1])) else: val=val+((Df[rat][(sUp,dUp,label,prob)][0])*((Df[rat][(sUp,dUp,label,prob)][1]))) for x in range(len(self.matrix[sUp][label])): if (self.matrix[sUp][label][x][0]==dUp): if not (self.matrix[sUp][label][x][1] > 10 or self.matrix[sUp][label][x][1] < -10) :#update weights rule if (dUp==1976): cClass=206.0 elif (dUp==1977): cClass=107.0 elif (dUp==1978): cClass=86.0 elif (dUp==1979): cClass=66.0 elif (dUp==1980): cClass=43.0 elif (dUp==1981): cClass=410.0 anneling=1+(cicla/T) if (sUp,dUp,label) in self.D_EdgeFrequency: lrateNew=(lrate/((1.0+(self.D_EdgeFrequency[(sUp,dUp,label)]*1.0))/(cClass+1.0)))/anneling else: lrateNew=(lrate/(1.0/(cClass+1.0)))/anneling self.matrix[sUp][label][x]=(self.matrix[sUp][label][x][0],self.matrix[sUp][label][x][1]-(lrateNew*val)) if (self.matrix[sUp][label][x][1]>10): self.matrix[sUp][label][x] = (self.matrix[sUp][label][x][0],10) if (self.matrix[sUp][label][x][1]<-10): self.matrix[sUp][label][x] = (self.matrix[sUp][label][x][0],-10) def proveSGD(self, startNode, endNode, maxDepth, query,Df,rat): if maxDepth == 1: return True depth = 0 visited = [0] * maxDepth prevLabels = [""] * maxDepth EdgeProbList = [0.0] * maxDepth visited[0] = startNode Stack = deque() Sampler = dict() NegSampled = [] EdgeLabels = set([q[depth] for q in query if len(q)>depth]) L = self.matrix[startNode] Stack.extend([(target,probEdge, EdgeLabel, depth+1) for EdgeLabel in EdgeLabels for (target,probEdge) in L.get(EdgeLabel, []) ]) while Stack: (target,probEdge, EdgeLabel, depth) = Stack.pop() prevLabels[depth-1] = EdgeLabel EdgeProbList[depth-1] = self.sigmoid(probEdge) prevNode = visited[depth-1] if True: if self.sigmoid(probEdge) < 1.0: Sampled = self.sampled_as_true(Sampler, prevNode, target, self.sigmoid(probEdge)) else: Sampled = True if Sampled: if target == endNode and (prevLabels[:depth] in query): visitedN=visited[:depth] visitedN.append(endNode) lt_visited=len(visitedN) z=0 for iEdge in range(lt_visited): if iEdge+1depth and q1[:depth] == prevLabels[:depth]]) L1 = self.matrix[target] Stack.extend([(target1,probEdge1, EdgeLabel1,depth+1) for EdgeLabel1 in EdgeLabels1 for (target1,probEdge1) in L1.get(EdgeLabel1, []) ]) else: elem = (prevNode, target, EdgeLabel, self.sigmoid(probEdge)) NegSampled.append(elem) return False def stampaFrequenze(self): for s,d,label in self.D_EdgeFrequency: if (label!='hasword' and label!='linkto' and label!='class'): print s,d,label,self.D_EdgeFrequency[(s,d,label)] def preprocessing(self, edgeList, query): for startNode,destNode in edgeList: dList=[2006,2007,2008,2009,2010,2011] for i in dList: D_e_frequency = {} self.provePreprocessing(startNode, i, max([len(x) for x in query]), query, D_e_frequency) for elem in D_e_frequency: if (elem in self.D_EdgeFrequency): self.D_EdgeFrequency[elem] = self.D_EdgeFrequency[elem]+1 else: self.D_EdgeFrequency[elem]=1 def provePreprocessing(self, startNode, endNode, maxDepth, query, D_e_frequency): if maxDepth == 1: return True depth = 0 visited = [0] * maxDepth prevLabels = [""] * maxDepth EdgeProbList = [0.0] * maxDepth visited[0] = startNode Stack = deque() Sampler = dict() NegSampled = [] EdgeLabels = set([q[depth] for q in query if len(q)>depth]) L = self.matrix[startNode] Stack.extend([(target,probEdge, EdgeLabel, depth+1) for EdgeLabel in EdgeLabels for (target,probEdge) in L.get(EdgeLabel, []) ]) while Stack: (target,probEdge, EdgeLabel, depth) = Stack.pop() prevLabels[depth-1] = EdgeLabel EdgeProbList[depth-1] = self.sigmoid(probEdge) prevNode = visited[depth-1] if True: Sampled=True if Sampled: if target == endNode and (prevLabels[:depth] in query): visitedN=visited[:depth] visitedN.append(endNode) lt_visited=len(visitedN) z=0 for iEdge in range(lt_visited): if iEdge+1depth and q1[:depth] == prevLabels[:depth]]) L1 = self.matrix[target] Stack.extend([(target1,probEdge1, EdgeLabel1,depth+1) for EdgeLabel1 in EdgeLabels1 for (target1,probEdge1) in L1.get(EdgeLabel1, []) ]) return False