[Merge] lp:~r-launchpad-jayantkrish-com/divisi/analogies into lp:divisi
Jayant has proposed merging lp:~r-launchpad-jayantkrish-com/divisi/analogies into lp:divisi.
Requested reviews:
Commonsense Computing (commonsense)
This is the code for CrossBridge analogies. crossbridge.py contains the actual analogy algorithm, and semantic_network.py contains a semantic network implementation.
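
For anyone who wants to try it, here is a minimal usage sketch (assuming a ConceptNet concept/feature tensor is already loaded as cnet_tensor; the import path is inferred from the file locations below):

    from csc.divisi.crossbridge import CrossBridge, cnet_graph_from_tensor

    cnet_graph = cnet_graph_from_tensor(cnet_tensor)
    crossbridge = CrossBridge(cnet_graph)  # builds the subgraph tensor and its SVD; may take a while
    analogies, top_relations = crossbridge.analogy(set(['bird', 'wing', 'fly', 'sky']))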
--
https://code.launchpad.net/~r-launchpad-jayantkrish-com/divisi/analogies/+merge/6868
Your team Commonsense Computing is subscribed to branch lp:divisi.
=== added file 'csc/divisi/crossbridge.py'
--- csc/divisi/crossbridge.py 1970-01-01 00:00:00 +0000
+++ csc/divisi/crossbridge.py 2009-05-28 18:16:29 +0000
@@ -0,0 +1,318 @@
+from collections import defaultdict
+from labeled_tensor import SparseLabeledTensor
+from semantic_network import SemanticNetwork
+import copy
+import logging
+
+class CrossBridge(object):
+ '''
+ This class implements the CrossBridge analogy algorithm.
+
+ Typical usage:
+ cnet_graph = cnet_graph_from_tensor(cnet_tensor)
+ crossbridge = CrossBridge(cnet_graph) # This may compute for a bit
+ # Then one of the following
+ analogies, top_relations = crossbridge.analogy(set(['bird', 'wing', 'fly', 'sky']))
+ analogies, top_relations = crossbridge.analogy_from_concept('bird')
+ '''
+
+ def __init__(self, graph, num_nodes=3, min_graph_edges=3,
+ min_feature_edges=2, max_feature_edges=3, svd_dims=100,
+ logging_interval=None):
+
+ self.num_nodes = num_nodes
+ self.graph = graph
+ self.tensor = self._build_analogy_tensor(graph, num_nodes, min_graph_edges,
+ min_feature_edges, max_feature_edges,
+ logging_interval=logging_interval)
+ self.svd = self.tensor.svd(k=svd_dims)
+
+
+ def _build_analogy_tensor(self, graph, num_nodes, min_graph_edges,
+ min_feature_edges, max_feature_edges,
+ logging_interval=None):
+
+ '''
+        Builds a tensor T whose rows correspond to ordered vertex tuples of
+        dense subgraphs of graph and whose columns are relation structures
+        (n-ary relations formed by combining multiple binary relations).
+
+        graph - The graph from which the tensor is built
+        num_nodes - The number of vertices in each subgraph (i.e., in each row's vertex tuple)
+        min_graph_edges - The minimum number of edges in the subgraphs being indexed
+        min_feature_edges - The minimum number of edges to include in a graph feature
+        max_feature_edges - The maximum number of edges to include in a graph feature
+
+        Note: the min_graph_edges parameter treats the graph as if it
+        were undirected and untyped, meaning there is at most one edge
+        between any two vertices.
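+
+        For example (hypothetical entry), the row key ('bird', 'wing', 'fly')
+        might map to a column key frozenset([(0, 1, 'HasA'), (0, 2, 'CapableOf')]):
+        the subgraph's edges rewritten in terms of each vertex's position
+        in the row key.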
+ '''
+ k_subgraphs = k_edge_subgraphs(graph, min_graph_edges, num_nodes,
+ logging_interval=logging_interval)
+ rel_tensor = SparseLabeledTensor(ndim=2)
+
+ logging_counter = 0
+ for size, subgraphs_vertices in k_subgraphs.iteritems():
+ if size[0] == num_nodes and size[1] >= min_graph_edges:
+ logging.debug("Adding graphs with %d vertices and %d edges (%d total)...",
+ size[0], size[1], len(subgraphs_vertices))
+ for subgraph_vertices in subgraphs_vertices:
+ subgraph = graph.subgraph_from_vertices(subgraph_vertices)
+ for subgraph_no_repeats in subgraph.enumerate_without_repeated_edges():
+ if (len(subgraph_no_repeats.edges()) >= min_feature_edges
+ and len(subgraph_no_repeats.edges()) <= max_feature_edges):
+ for vertex_order in enumerate_orders(subgraph_vertices):
+ vm = dict([(y, x) for x, y in enumerate(vertex_order)])
+ edges = frozenset([(vm[v1], vm[v2], value) for v1, v2, value in subgraph_no_repeats.edges()])
+ rel_tensor[vertex_order, edges] = 1
+
+ if logging_interval is not None and logging_counter % logging_interval == 0:
+ logging.debug("%r", subgraph_vertices)
+ logging_counter += 1
+
+ return rel_tensor
+
+
+ def analogy(self, source_concept_set, logging_interval=None,
+ num_candidates=300, beam_width=10000):
+ '''
+ Takes a set of source concepts and returns a sorted, scored
+ list of analogies for the source concepts and the most
+ important relations.
+
+ num_candidates and beam_width are parameters that control the
+ number of candidate analogies maintained at various points during
+ the search for good analogies.
+
+        Note: The important relations are computed by a very hacky
+        algorithm... I (jayant) don't trust them very much.
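+
+        Returns a pair (analogies, relations): analogies is a list of
+        (frozenset of (target, source) pairs, score) tuples sorted by
+        decreasing score; relations is a list of
+        ((source1, rel, source2), score) tuples, also sorted by
+        decreasing score.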
+ '''
+
+        # First, search for analogies between subsets of
+        # self.num_nodes concepts drawn from source_concept_set.
+ candidate_target_concept_sets = []
+ candidate_relations = defaultdict(lambda: 0)
+ for concept_triple in k_subset_iter(self.num_nodes, source_concept_set):
+ try:
+ concept_triple = tuple(concept_triple)
+ candidates = self.svd.u_angles_to(self.svd.weighted_u[concept_triple, :]).top_items(num_candidates)
+ candidate_target_concept_sets.extend([(frozenset(zip(target_triple, concept_triple)), score) for target_triple, score in candidates])
+
+ inferred_relations = self.svd.v_angles_to(self.svd.weighted_u[concept_triple, :]).top_items(num_candidates)
+ for relations, score in inferred_relations:
+ for c1, c2, rel in relations:
+ candidate_relations[(concept_triple[c1], rel, concept_triple[c2])] += score
+
+ logging.debug("Found source concept set: %r", concept_triple)
+ logging.debug("Top 10 analogies: %r", candidates[:10])
+ logging.debug("Top 10 relations: %r", inferred_relations[:10])
+ except KeyError:
+ pass
+
+        # Iteratively merge the candidate analogies by combining
+        # n-concept analogies that share 2 overlapping concepts.
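+        # For example (hypothetical mappings): {(t1, s1), (t2, s2), (t3, s3)}
+        # and {(t2, s2), (t3, s3), (t4, s4)} share exactly two
+        # (target, source) pairs, so they merge into the 4-concept
+        # analogy {(t1, s1), (t2, s2), (t3, s3), (t4, s4)}.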
+ smallest_analogies = copy.copy(candidate_target_concept_sets)
+ candidates = [set(candidate_target_concept_sets)]
+ iteration_num = 0
+ while len(candidates[-1]) != 0:
+ logging.debug("Starting iteration %d (%d candidates)", iteration_num, len(candidates[-1]))
+ iteration_num += 1
+
+ new_candidates = set()
+ candidate_scores = {}
+ candidates_with_children = set()
+ for mapping, score in candidates[-1]:
+ for mapping2, score2 in smallest_analogies:
+ if (len(mapping & mapping2) == 2 and
+ len(set([x for x, y in mapping]) & set([x for x, y in mapping2])) == 2 and
+ len(set([y for x, y in mapping]) & set([y for x, y in mapping2])) == 2):
+ new_candidate = mapping | mapping2
+ if new_candidate in new_candidates:
+ candidate_scores[new_candidate] = max(score + score2, candidate_scores[new_candidate])
+ else:
+ new_candidates.add(new_candidate)
+ candidate_scores[new_candidate] = score + score2
+ candidates_with_children.add((mapping, score))
+
+ candidates[-1] = candidates[-1] - candidates_with_children
+ logging.debug("terminal candidates: %d", len(candidates[-1]))
+ new_candidates = set([(x, candidate_scores[x]) for x in new_candidates])
+
+ if beam_width is not None:
+                # Prune the current batch of candidates
+ new_candidates = list(new_candidates)
+ new_candidates.sort(key=lambda x: x[1], reverse=True)
+ new_candidates = set(new_candidates[:beam_width])
+
+ candidates.append(new_candidates)
+
+ all_candidates = list(reduce(lambda x, y: y.union(x), candidates, set()))
+
+    # Remove duplicate target sets
+ seen_targets = defaultdict(set)
+ for candidate in all_candidates:
+ seen_targets[frozenset([x for x, y in candidate[0]])].add(candidate)
+
+ filtered_candidates = []
+ for target_candidate, base_candidates in seen_targets.iteritems():
+ filtered_candidates.append(max(base_candidates, key=lambda x:x[1]))
+
+ filtered_candidates.sort(key=lambda x: x[1], reverse=True)
+
+ return filtered_candidates, sorted(candidate_relations.items(), key=lambda x: x[1], reverse=True)
+
+
+ def analogy_from_concept(self, source_concept, logging_interval=None,
+ num_candidates=100,
+ beam_width=None):
+ '''
+ Finds an analogy for a single concept. The algorithm simply
+ uses the CrossBridge.analogy method to find analogies for
+ neighbors of source_concept.
+
+ (This procedure is hacky because not all of the neighboring
+ concepts are important for the analogy)
+ '''
+
+        # Get the concepts related to the chosen concept
+ related_concepts = set(self.graph.get_neighbors(source_concept))
+ related_concepts.add(source_concept)
+
+ logging.debug("Concept analogy: %r", source_concept)
+ logging.debug("looking up: %r", related_concepts)
+ # Find an analogy with these concepts
+ return self.analogy(related_concepts,
+ logging_interval=logging_interval, num_candidates=num_candidates,
+ beam_width=beam_width)
+
+
+def cnet_graph_from_tensor(cnet, delete_reltypes=None,
+ assertion_weight_threshold=0):
+ '''
+ This is a sort of hacky way to transform the typical
+ ConceptNet concept / feature matrix into a SemanticNetwork.
+
+ assertion_weight_threshold - only assertions with scores greater than the threshold are included in the graph.
+ delete_reltypes - a list of relationship types that shouldn't be included in the graph
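+
+    The tensor keys are assumed to follow the usual divisi
+    concept/feature layout: (concept, (direction, relation,
+    other_concept)), where direction is 'left' or 'right'.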
+ '''
+
+ concepts = set(cnet.dim_keys(0))
+
+ cnet_graph = SemanticNetwork()
+ for assertion, score in cnet.iteritems():
+ if score < assertion_weight_threshold:
+ continue
+
+ if assertion[1][0] == 'left':
+ # left feature
+ c1 = assertion[1][2]
+ c2 = assertion[0]
+ r = assertion[1][1]
+ else:
+ # right feature
+ c1 = assertion[0]
+ c2 = assertion[1][2]
+ r = assertion[1][1]
+
+ cnet_graph.add_edge(c1, c2, r)
+
+ if delete_reltypes is not None:
+ logging.debug("Removing relations: %r", delete_reltypes)
+ edges = copy.copy(cnet_graph.edges())
+ for edge in edges:
+ if edge[2] in delete_reltypes:
+ cnet_graph.del_edge(edge)
+
+ return cnet_graph
+
+# Utility functions used by CrossBridge
+
+def k_subset_iter(k, in_set):
+ '''
+ Returns an iterator over all possible subsets x of in_set where x
+ contains exactly k items.
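+
+    For example, k_subset_iter(2, set([1, 2, 3])) yields the subsets
+    {1, 2}, {1, 3} and {2, 3} (in no particular order).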
+ '''
+ def k_subset_iter_helper(k, in_set, base_set):
+ if len(base_set) == k:
+ yield base_set
+ else:
+ remaining_concepts = in_set.difference(base_set)
+ if len(remaining_concepts) == 0:
+ return
+
+ cur_concept = remaining_concepts.pop()
+ # Return all subsets without this element
+ for subset in k_subset_iter_helper(k, remaining_concepts, base_set):
+ yield subset
+
+ # Return all subsets with this element
+ new_set = copy.copy(base_set)
+ new_set.add(cur_concept)
+ for subset in k_subset_iter_helper(k, remaining_concepts, new_set):
+ yield subset
+
+ return k_subset_iter_helper(k, set(in_set), set())
+
+def enumerate_orders(in_set):
+ '''
+ Returns an iterator over all permutations (orderings) of in_set
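+
+    For example, enumerate_orders(set([1, 2])) yields the tuples
+    (1, 2) and (2, 1) (in no particular order).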
+ '''
+ def helper(in_set):
+ if len(in_set) == 0:
+ yield []
+ else:
+ for item in in_set:
+ for remainder in helper(in_set - set([item])):
+ yield [item] + remainder
+
+ for order in helper(in_set):
+ yield tuple(order)
+
+def k_edge_subgraphs(graph, min_subgraph_edges, num_subgraph_vertices, logging_interval=None):
+ '''
+ Bottom-up dynamic programming approach to finding all subgraphs of
+ graph with exactly num_subgraph_vertices vertices and at least
+ min_subgraph_edges edges. This algorithm treats graph as if it
+ were untyped and undirected, so there is at most 1 edge between 2
+ vertices.
+
+ graph - the graph from which subgraphs should be extracted
+ num_subgraph_vertices - The number of vertices in the subgraphs
+    min_subgraph_edges - The minimum number of edges in each returned
+        subgraph; must be at least num_subgraph_vertices - 1 (so the
+        subgraph is connected)
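+
+    The returned value maps (num_vertices, num_edges) pairs to
+    collections of frozen vertex sets; e.g. found_subgraphs[(3, 2)]
+    holds the vertex sets of all 3-vertex, 2-edge (path-shaped)
+    subgraphs.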
+ '''
+ assert(min_subgraph_edges >= num_subgraph_vertices-1)
+
+ found_subgraphs = defaultdict(set)
+
+    # Base case: the set of 2-vertex subgraphs connected by 1 edge.
+    # found_subgraphs is a defaultdict of sets, so multiple typed edges
+    # between the same pair of vertices yield only one entry.
+    for v1, v2, edge_type in graph.edges():
+        found_subgraphs[(2, 1)].add(frozenset([v1, v2]))
+
+ logging_counter = 0
+ for num_nodes in xrange(3,num_subgraph_vertices + 1):
+ logging.debug("Finding graphs with %d vertices...", num_nodes)
+ for i in xrange(num_nodes - 2, min(min_subgraph_edges, (num_nodes - 2)*(num_nodes - 1)/2 + 1)):
+            # Extend each subgraph with (num_nodes - 1) vertices and i
+            # edges by one adjacent vertex, filling in the
+            # found_subgraphs entries for num_nodes vertices.
+ logging.debug("Checking subgraphs with %d vertices and %d edges (total: %d)...",
+ num_nodes - 1, i, len(found_subgraphs[num_nodes - 1, i]))
+ for soln_subgraph in found_subgraphs[num_nodes - 1, i]:
+                # Compute edge counts for vertices adjacent to soln_subgraph
+ vertex_counts = defaultdict(lambda: 0)
+ for soln_vertex in soln_subgraph:
+ for v in graph.get_neighbors(soln_vertex):
+ if v not in soln_subgraph:
+ vertex_counts[v] += 1
+
+ for v, count in vertex_counts.iteritems():
+ found_subgraphs[(num_nodes, count + i)].add(soln_subgraph.union(frozenset([v])))
+
+ if logging_interval is not None and logging_counter % logging_interval == 0:
+ logging.debug("Adding vertices to %r ...", soln_subgraph)
+ logging_counter += 1
+
+ return found_subgraphs
+
=== added file 'csc/divisi/semantic_network.py'
--- csc/divisi/semantic_network.py 1970-01-01 00:00:00 +0000
+++ csc/divisi/semantic_network.py 2009-05-28 18:16:29 +0000
@@ -0,0 +1,209 @@
+from collections import defaultdict
+import copy
+
+class SemanticNetwork(object):
+ '''
+ This class represents a directed graph with typed edges. Vertices
+ are represented by hashable objects, and edges are (source vertex,
+ target vertex, <type>) tuples. There can be many different edges
+ with the same type, but there cannot be multiple vertices with the
+ same value.
+
+ SemanticNetworks are mutable.
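+
+    A small usage sketch:
+        g = SemanticNetwork()
+        g.add_edge('bird', 'wing', 'HasA')
+        g.get_neighbors('bird')    # frozenset(['wing'])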
+ '''
+ def __init__(self):
+ self._vertices = set()
+ self._edges = set()
+
+ self._forward_edges = defaultdict(set)
+ self._reverse_edges = defaultdict(set)
+ self._connecting_edges = defaultdict(set)
+
+ self._edge_types = set()
+
+ def vertices(self):
+ return frozenset(self._vertices)
+
+ def edges(self):
+ '''
+ Returns all edges in this graph as a set of (source, target, value) tuples
+ '''
+ return frozenset(self._edges)
+
+ def edge_types(self):
+ return frozenset(self._edge_types)
+
+ def add_vertex(self, v):
+ self._vertices.add(v)
+
+ def add_edge(self, src, dst, value):
+ edge = (src, dst, value)
+ self.add_vertex(src)
+ self.add_vertex(dst)
+ self._edges.add(edge)
+ self._edge_types.add(value)
+
+ self._forward_edges[src].add(edge)
+ self._reverse_edges[dst].add(edge)
+ self._connecting_edges[(src, dst)].add(value)
+
+ def del_edge(self, edge):
+ if edge in self._edges:
+ src, dst, val = edge
+ self._edges.remove(edge)
+ self._forward_edges[src].remove(edge)
+ self._reverse_edges[dst].remove(edge)
+ self._connecting_edges[(src, dst)].remove(val)
+
+ def get_out_edges(self, src, reltype=None):
+ '''
+ Returns the set of outbound edges as (src, <target>, <value>) tuples
+ '''
+ if reltype is None:
+ return self._forward_edges.get(src, set())
+ else:
+ return set([edge for edge in self._forward_edges.get(src, set()) if edge[2] == reltype])
+
+ def get_in_edges(self, dst, reltype=None):
+ '''
+ Returns the set of inbound edges as (<source>, dst, <value>) tuples
+ '''
+ if reltype is None:
+ return self._reverse_edges.get(dst, set())
+ else:
+ return set([edge for edge in self._reverse_edges.get(dst, set()) if edge[2] == reltype])
+
+ def get_neighbors(self, vertex):
+ out_neighbors = frozenset([t for s, t, v in self.get_out_edges(vertex)])
+ in_neighbors = frozenset([s for s, t, v in self.get_in_edges(vertex)])
+
+ return out_neighbors.union(in_neighbors)
+
+ def get_reachable_vertices(self, src, max_depth=None):
+ '''
+ Returns the set of all vertices that can be reached by following
+ directed edges out of src.
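+
+        For example, with max_depth=1 the result is src together with
+        its immediate successors.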
+ '''
+ found_vertices = set()
+
+        def traverse(graph, v, depth):
+            found_vertices.add(v)
+            if max_depth is not None and depth == max_depth:
+                return
+            for _, next_v, _ in graph.get_out_edges(v):
+                # Skip already-visited vertices; without this check the
+                # recursion would not terminate on cyclic graphs.
+                if next_v not in found_vertices:
+                    traverse(graph, next_v, depth + 1)
+        traverse(self, src, 0)
+
+ return frozenset(found_vertices)
+
+    def topological_sort(self, start):
+        done_order = []
+        visited = set()
+
+        def dfs_traverse(graph, v):
+            visited.add(v)
+            for _, next_v, _ in graph.get_out_edges(v):
+                # Only descend into unvisited vertices; this avoids
+                # infinite recursion on cycles and repeated work on
+                # shared successors.
+                if next_v not in visited:
+                    dfs_traverse(graph, next_v)
+            done_order.append(v)
+
+        dfs_traverse(self, start)
+        return reversed(done_order)
+
+ def get_connecting_edge_types(self, src, dst):
+ return self._connecting_edges[(src, dst)]
+
+ def subgraph_from_vertices(self, vertices):
+ g = SemanticNetwork()
+
+ for v in vertices:
+ g.add_vertex(v)
+ for source, target, value in self.get_out_edges(v):
+ if target in vertices:
+ g.add_edge(source, target, value)
+
+ return g
+
+    def enumerate_without_repeated_edges(self):
+        '''
+        Enumerates the graphs h containing a subset of the edges of
+        this graph g, such that every pair of vertices connected by
+        one or more relationships in g is connected by at most one of
+        those relationships (in either direction) in h.
+        '''
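+        # For example, if the graph contains the edges ('a', 'b', 1)
+        # and ('a', 'b', 2), the enumeration yields variants with no
+        # a-b edge, with ('a', 'b', 1) only, and with ('a', 'b', 2)
+        # only, each combined with every variant of the rest of the
+        # graph.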
+ if len(self.edges()) == 0:
+ yield copy.deepcopy(self)
+ else:
+ g = copy.deepcopy(self)
+ for vertices, reltypes in self._connecting_edges.iteritems():
+ src,dst = vertices
+
+ backward_reltypes = set()
+ if (dst, src) in self._connecting_edges:
+ backward_reltypes = self._connecting_edges[(dst, src)]
+
+                # del_edge can leave empty sets behind in
+                # _connecting_edges, meaning there are no longer any
+                # edges between these two vertices
+ if len(reltypes) + len(backward_reltypes) == 0:
+ continue
+
+ for rel in reltypes:
+ g.del_edge((src, dst, rel))
+ for rel in backward_reltypes:
+ g.del_edge((dst, src, rel))
+
+ for graph_portion in g.enumerate_without_repeated_edges():
+ yield graph_portion
+
+ for rel in reltypes:
+ h = copy.deepcopy(graph_portion)
+ h.add_edge(src, dst, rel)
+ yield h
+
+ for rel in backward_reltypes:
+ h = copy.deepcopy(graph_portion)
+ h.add_edge(dst, src, rel)
+ yield h
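+
+                # Only the first vertex pair with edges is expanded here;
+                # the recursive call above handles all remaining pairs.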
+ break
+
+ def __repr__(self):
+ return "[SemanticNetwork:\nVertices: %r \nEdges: %r\n]" % (self.vertices(), self.edges())
+
+if __name__ == "__main__":
+ g = SemanticNetwork()
+ g2 = SemanticNetwork()
+
+ g.add_vertex("v1")
+ g.add_edge("v1", "v2", 4)
+ g.add_edge("v2", "v3", 5)
+
+ g2.add_vertex("v1")
+ g2.add_edge("v1", "v2", 4)
+ g2.add_edge("v2", "v3", 5)
+
+    print list(g2.topological_sort("v1"))
+
+ reachable = g2.get_reachable_vertices("v1")
+ assert(reachable == frozenset(["v2", "v1", "v3"]))
+ reachable = g2.get_reachable_vertices("v1", max_depth=1)
+ assert(reachable == frozenset(["v2", "v1"]))
+
+ assert(g.vertices() == set(["v1", "v2", "v3"]))
+ assert(g.edges() == set([("v1", "v2", 4), ("v2", "v3", 5)]))
+
+ g = SemanticNetwork()
+ g.add_edge('a', 'b', 1)
+ g.add_edge('a', 'b', 2)
+ g.add_edge('a', 'b', 3)
+ g.add_edge('b', 'c', 1)
+ g.add_edge('b', 'c', 2)
+ g.add_edge('a', 'c', 1)
+ g.add_vertex('d')
+
+ for h in g.enumerate_without_repeated_edges():
+ print h
+
=== modified file 'csc/divisi/svd.py'
--- csc/divisi/svd.py 2009-05-27 04:31:34 +0000
+++ csc/divisi/svd.py 2009-05-28 18:05:43 +0000
@@ -174,7 +174,7 @@
ranges from -1 to 1.
'''
u = self.weighted_u
- angles = self.u_distances_(vec)
+ angles = self.u_dotproducts_with(vec)
vec_magnitude = sqrt(vec*vec)
# Normalize distances to get the cos(angles)
for key, value in angles.iteritems():