[Merge] lp:~r-launchpad-jayantkrish-com/divisi/analogies into lp:divisi
Jayant has proposed merging lp:~r-launchpad-jayantkrish-com/divisi/analogies into lp:divisi.
Requested reviews:
Commonsense Computing (commonsense)
This is the code for CrossBridge analogies. crossbridge.py contains the actual analogy algorithm, and semantic_network.py contains a semantic network implementation.
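
For anyone who wants to try it, here is a minimal usage sketch (assuming a ConceptNet concept/feature tensor is already loaded as cnet_tensor; the import path is inferred from the file locations below):

    from csc.divisi.crossbridge import CrossBridge, cnet_graph_from_tensor

    cnet_graph = cnet_graph_from_tensor(cnet_tensor)
    crossbridge = CrossBridge(cnet_graph)  # builds the subgraph tensor and its SVD; may take a while
    analogies, top_relations = crossbridge.analogy(set(['bird', 'wing', 'fly', 'sky']))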
--
https://code.launchpad.net/~r-launchpad-jayantkrish-com/divisi/analogies/+merge/6868
Your team Commonsense Computing is subscribed to branch lp:divisi.
=== added file 'csc/divisi/crossbridge.py'
--- csc/divisi/crossbridge.py 1970-01-01 00:00:00 +0000
+++ csc/divisi/crossbridge.py 2009-05-28 18:16:29 +0000
@@ -0,0 +1,318 @@
+from collections import defaultdict
+from labeled_tensor import SparseLabeledTensor
+from semantic_network import SemanticNetwork
+import copy
+import logging
+
+class CrossBridge(object):
+ '''
+ This class implements the CrossBridge analogy algorithm.
+
+ Typical usage:
+ cnet_graph = cnet_graph_from_tensor(cnet_tensor)
+ crossbridge = CrossBridge(cnet_graph) # This may compute for a bit
+ # Then one of the following
+ analogies, top_relations = crossbridge.analogy(set(['bird', 'wing', 'fly', 'sky']))
+ analogies, top_relations = crossbridge.analogy_from_concept('bird')
+ '''
+
+ def __init__(self, graph, num_nodes=3, min_graph_edges=3,
+ min_feature_edges=2, max_feature_edges=3, svd_dims=100,
+ logging_interval=None):
+
+ self.num_nodes = num_nodes
+ self.graph = graph
+ self.tensor = self._build_analogy_tensor(graph, num_nodes, min_graph_edges,
+ min_feature_edges, max_feature_edges,
+ logging_interval=logging_interval)
+ self.svd = self.tensor.svd(k=svd_dims)
+
+
+ def _build_analogy_tensor(self, graph, num_nodes, min_graph_edges,
+ min_feature_edges, max_feature_edges,
+ logging_interval=None):
+
+ '''
+        Builds a tensor T whose rows correspond to ordered vertex tuples of
+        dense subgraphs of graph and whose columns are relation structures
+        (n-ary relations formed by combining multiple binary relations).
+
+        graph - The graph from which the tensor is built
+        num_nodes - The number of vertices in each subgraph (i.e., in each row's vertex tuple)
+        min_graph_edges - The minimum number of edges in the subgraphs being indexed
+        min_feature_edges - The minimum number of edges to include in a graph feature
+        max_feature_edges - The maximum number of edges to include in a graph feature
+
+        Note: the min_graph_edges parameter treats the graph as if it
+        were undirected and untyped, meaning there is at most one edge
+        between any two vertices.
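+
+        For example (hypothetical entry), the row key ('bird', 'wing', 'fly')
+        might map to a column key frozenset([(0, 1, 'HasA'), (0, 2, 'CapableOf')]):
+        the subgraph's edges rewritten in terms of each vertex's position
+        in the row key.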
+ '''
+ k_subgraphs = k_edge_subgraphs(graph, min_graph_edges, num_nodes,
+ logging_interval=logging_interval)
+ rel_tensor = SparseLabeledTensor(ndim=2)
+
+ logging_counter = 0
+ for size, subgraphs_vertices in k_subgraphs.iteritems():
+ if size[0] == num_nodes and size[1] >= min_graph_edges:
+ logging.debug("Adding graphs with %d vertices and %d edges (%d total)...",
+ size[0], size[1], len(subgraphs_vertices))
+ for subgraph_vertices in subgraphs_vertices:
+ subgraph = graph.subgraph_from_vertices(subgraph_vertices)
+ for subgraph_no_repeats in subgraph.enumerate_without_repeated_edges():
+ if (len(subgraph_no_repeats.edges()) >= min_feature_edges
+ and len(subgraph_no_repeats.edges()) <= max_feature_edges):
+ for vertex_order in enumerate_orders(subgraph_vertices):
+ vm = dict([(y, x) for x, y in enumerate(vertex_order)])
+ edges = frozenset([(vm[v1], vm[v2], value) for v1, v2, value in subgraph_no_repeats.edges()])
+ rel_tensor[vertex_order, edges] = 1
+
+ if logging_interval is not None and logging_counter % logging_interval == 0:
+ logging.debug("%r", subgraph_vertices)
+ logging_counter += 1
+
+ return rel_tensor
+
+
+ def analogy(self, source_concept_set, logging_interval=None,
+ num_candidates=300, beam_width=10000):
+ '''
+ Takes a set of source concepts and returns a sorted, scored
+ list of analogies for the source concepts and the most
+ important relations.
+
+ num_candidates and beam_width are parameters that control the
+ number of candidate analogies maintained at various points during
+ the search for good analogies.
+
+        Note: The important relations are computed by a very hacky
+        algorithm... I (jayant) don't trust them very much.
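+
+        Returns a pair (analogies, relations): analogies is a list of
+        (frozenset of (target, source) pairs, score) tuples sorted by
+        decreasing score; relations is a list of
+        ((source1, rel, source2), score) tuples, also sorted by
+        decreasing score.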
+ '''
+
+        # First, search for analogies between subsets of
+        # self.num_nodes concepts drawn from source_concept_set.
+ candidate_target_concept_sets = []
+ candidate_relations = defaultdict(lambda: 0)
+ for concept_triple in k_subset_iter(self.num_nodes, source_concept_set):
+ try:
+ concept_triple = tuple(concept_triple)
+ candidates = self.svd.u_angles_to(self.svd.weighted_u[concept_triple, :]).top_items(num_candidates)
+ candidate_target_concept_sets.extend([(frozenset(zip(target_triple, concept_triple)), score) for target_triple, score in candidates])
+
+ inferred_relations = self.svd.v_angles_to(self.svd.weighted_u[concept_triple, :]).top_items(num_candidates)
+ for relations, score in inferred_relations:
+ for c1, c2, rel in relations:
+ candidate_relations[(concept_triple[c1], rel, concept_triple[c2])] += score
+
+ logging.debug("Found source concept set: %r", concept_triple)
+ logging.debug("Top 10 analogies: %r", candidates[:10])
+ logging.debug("Top 10 relations: %r", inferred_relations[:10])
+ except KeyError:
+ pass
+
+        # Iteratively merge the candidate analogies by combining
+        # n-concept analogies that share 2 overlapping concepts.
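+        # For example (hypothetical mappings): {(t1, s1), (t2, s2), (t3, s3)}
+        # and {(t2, s2), (t3, s3), (t4, s4)} share exactly two
+        # (target, source) pairs, so they merge into the 4-concept
+        # analogy {(t1, s1), (t2, s2), (t3, s3), (t4, s4)}.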
+ smallest_analogies = copy.copy(candidate_target_concept_sets)
+ candidates = [set(candidate_target_concept_sets)]
+ iteration_num = 0
+ while len(candidates[-1]) != 0:
+ logging.debug("Starting iteration %d (%d candidates)", iteration_num, len(candidates[-1]))
+ iteration_num += 1
+
+ new_candidates = set()
+ candidate_scores = {}
+ candidates_with_children = set()
+ for mapping, score in candidates[-1]:
+ for mapping2, score2 in smallest_analogies:
+ if (len(mapping & mapping2) == 2 and
+ len(set([x for x, y in mapping]) & set([x for x, y in mapping2])) == 2 and
+ len(set([y for x, y in mapping]) & set([y for x, y in mapping2])) == 2):
+ new_candidate = mapping | mapping2
+ if new_candidate in new_candidates:
+ candidate_scores[new_candidate] = max(score + score2, candidate_scores[new_candidate])
+ else:
+ new_candidates.add(new_candidate)
+ candidate_scores[new_candidate] = score + score2
+ candidates_with_children.add((mapping, score))
+
+ candidates[-1] = candidates[-1] - candidates_with_children
+ logging.debug("terminal candidates: %d", len(candidates[-1]))
+ new_candidates = set([(x, candidate_scores[x]) for x in new_candidates])
+
+ if beam_width is not None:
+                # Prune the current batch of candidates
+ new_candidates = list(new_candidates)
+ new_candidates.sort(key=lambda x: x[1], reverse=True)
+ new_candidates = set(new_candidates[:beam_width])
+
+ candidates.append(new_candidates)
+
+ all_candidates = list(reduce(lambda x, y: y.union(x), candidates, set()))
+
+    # Remove duplicate target sets
+ seen_targets = defaultdict(set)
+ for candidate in all_candidates:
+ seen_targets[frozenset([x for x, y in candidate[0]])].add(candidate)
+
+ filtered_candidates = []
+ for target_candidate, base_candidates in seen_targets.iteritems():
+ filtered_candidates.append(max(base_candidates, key=lambda x:x[1]))
+
+ filtered_candidates.sort(key=lambda x: x[1], reverse=True)
+
+ return filtered_candidates, sorted(candidate_relations.items(), key=lambda x: x[1], reverse=True)
+
+
+ def analogy_from_concept(self, source_concept, logging_interval=None,
+ num_candidates=100,
+ beam_width=None):
+ '''
+ Finds an analogy for a single concept. The algorithm simply
+ uses the CrossBridge.analogy method to find analogies for
+ neighbors of source_concept.
+
+ (This procedure is hacky because not all of the neighboring
+ concepts are important for the analogy)
+ '''
+
+        # Get the concepts related to the chosen concept
+ related_concepts = set(self.graph.get_neighbors(source_concept))
+ related_concepts.add(source_concept)
+
+ logging.debug("Concept analogy: %r", source_concept)
+ logging.debug("looking up: %r", related_concepts)
+ # Find an analogy with these concepts
+ return self.analogy(related_concepts,
+ logging_interval=logging_interval, num_candidates=num_candidates,
+ beam_width=beam_width)
+
+
+def cnet_graph_from_tensor(cnet, delete_reltypes=None,
+ assertion_weight_threshold=0):
+ '''
+ This is a sort of hacky way to transform the typical
+ ConceptNet concept / feature matrix into a SemanticNetwork.
+
+ assertion_weight_threshold - only assertions with scores greater than the threshold are included in the graph.
+ delete_reltypes - a list of relationship types that shouldn't be included in the graph
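+
+    The tensor keys are assumed to follow the usual divisi
+    concept/feature layout: (concept, (direction, relation,
+    other_concept)), where direction is 'left' or 'right'.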
+ '''
+
+ concepts = set(cnet.dim_keys(0))
+
+ cnet_graph = SemanticNetwork()
+ for assertion, score in cnet.iteritems():
+ if score < assertion_weight_threshold:
+ continue
+
+ if assertion[1][0] == 'left':
+ # left feature
+ c1 = assertion[1][2]
+ c2 = assertion[0]
+ r = assertion[1][1]
+ else:
+ # right feature
+ c1 = assertion[0]
+ c2 = assertion[1][2]
+ r = assertion[1][1]
+
+ cnet_graph.add_edge(c1, c2, r)
+
+ if delete_reltypes is not None:
+ logging.debug("Removing relations: %r", delete_reltypes)
+ edges = copy.copy(cnet_graph.edges())
+ for edge in edges:
+ if edge[2] in delete_reltypes:
+ cnet_graph.del_edge(edge)
+
+ return cnet_graph
+
+# Utility functions used by CrossBridge
+
+def k_subset_iter(k, in_set):
+ '''
+ Returns an iterator over all possible subsets x of in_set where x
+ contains exactly k items.
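+
+    For example, k_subset_iter(2, set([1, 2, 3])) yields the subsets
+    {1, 2}, {1, 3} and {2, 3} (in no particular order).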
+ '''
+ def k_subset_iter_helper(k, in_set, base_set):
+ if len(base_set) == k:
+ yield base_set
+ else:
+ remaining_concepts = in_set.difference(base_set)
+ if len(remaining_concepts) == 0:
+ return
+
+ cur_concept = remaining_concepts.pop()
+ # Return all subsets without this element
+ for subset in k_subset_iter_helper(k, remaining_concepts, base_set):
+ yield subset
+
+ # Return all subsets with this element
+ new_set = copy.copy(base_set)
+ new_set.add(cur_concept)
+ for subset in k_subset_iter_helper(k, remaining_concepts, new_set):
+ yield subset
+
+ return k_subset_iter_helper(k, set(in_set), set())
+
+def enumerate_orders(in_set):
+ '''
+ Returns an iterator over all permutations (orderings) of in_set
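+
+    For example, enumerate_orders(set([1, 2])) yields the tuples
+    (1, 2) and (2, 1) (in no particular order).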
+ '''
+ def helper(in_set):
+ if len(in_set) == 0:
+ yield []
+ else:
+ for item in in_set:
+ for remainder in helper(in_set - set([item])):
+ yield [item] + remainder
+
+ for order in helper(in_set):
+ yield tuple(order)
+
+def k_edge_subgraphs(graph, min_subgraph_edges, num_subgraph_vertices, logging_interval=None):
+ '''
+ Bottom-up dynamic programming approach to finding all subgraphs of
+ graph with exactly num_subgraph_vertices vertices and at least
+ min_subgraph_edges edges. This algorithm treats graph as if it
+ were untyped and undirected, so there is at most 1 edge between 2
+ vertices.
+
+ graph - the graph from which subgraphs should be extracted
+ num_subgraph_vertices - The number of vertices in the subgraphs
+    min_subgraph_edges - The minimum number of edges in each returned
+        subgraph; must be at least num_subgraph_vertices - 1 (so the
+        subgraph is connected)
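+
+    The returned value maps (num_vertices, num_edges) pairs to
+    collections of frozen vertex sets; e.g. found_subgraphs[(3, 2)]
+    holds the vertex sets of all 3-vertex, 2-edge (path-shaped)
+    subgraphs.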
+ '''
+ assert(min_subgraph_edges >= num_subgraph_vertices-1)
+
+ found_subgraphs = defaultdict(set)
+
+    # Base case: the set of 2-vertex subgraphs connected by 1 edge.
+    # found_subgraphs is a defaultdict of sets, so multiple typed edges
+    # between the same pair of vertices yield only one entry.
+    for v1, v2, edge_type in graph.edges():
+        found_subgraphs[(2, 1)].add(frozenset([v1, v2]))
+
+ logging_counter = 0
+ for num_nodes in xrange(3,num_subgraph_vertices + 1):
+ logging.debug("Finding graphs with %d vertices...", num_nodes)
+ for i in xrange(num_nodes - 2, min(min_subgraph_edges, (num_nodes - 2)*(num_nodes - 1)/2 + 1)):
+            # Extend each subgraph with (num_nodes - 1) vertices and i
+            # edges by one adjacent vertex, filling in the
+            # found_subgraphs entries for num_nodes vertices.
+ logging.debug("Checking subgraphs with %d vertices and %d edges (total: %d)...",
+ num_nodes - 1, i, len(found_subgraphs[num_nodes - 1, i]))
+ for soln_subgraph in found_subgraphs[num_nodes - 1, i]:
+                # Compute edge counts for vertices adjacent to soln_subgraph
+ vertex_counts = defaultdict(lambda: 0)
+ for soln_vertex in soln_subgraph:
+ for v in graph.get_neighbors(soln_vertex):
+ if v not in soln_subgraph:
+ vertex_counts[v] += 1
+
+ for v, count in vertex_counts.iteritems():
+ found_subgraphs[(num_nodes, count + i)].add(soln_subgraph.union(frozenset([v])))
+
+ if logging_interval is not None and logging_counter % logging_interval == 0:
+ logging.debug("Adding vertices to %r ...", soln_subgraph)
+ logging_counter += 1
+
+ return found_subgraphs
+
=== added file 'csc/divisi/semantic_network.py'
--- csc/divisi/semantic_network.py 1970-01-01 00:00:00 +0000
+++ csc/divisi/semantic_network.py 2009-05-28 18:16:29 +0000
@@ -0,0 +1,209 @@
+from collections import defaultdict
+import copy
+
+class SemanticNetwork(object):
+ '''
+ This class represents a directed graph with typed edges. Vertices
+ are represented by hashable objects, and edges are (source vertex,
+ target vertex, <type>) tuples. There can be many different edges
+ with the same type, but there cannot be multiple vertices with the
+ same value.
+
+ SemanticNetworks are mutable.
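+
+    A small usage sketch:
+        g = SemanticNetwork()
+        g.add_edge('bird', 'wing', 'HasA')
+        g.get_neighbors('bird')    # frozenset(['wing'])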
+ '''
+ def __init__(self):
+ self._vertices = set()
+ self._edges = set()
+
+ self._forward_edges = defaultdict(set)
+ self._reverse_edges = defaultdict(set)
+ self._connecting_edges = defaultdict(set)
+
+ self._edge_types = set()
+
+ def vertices(self):
+ return frozenset(self._vertices)
+
+ def edges(self):
+ '''
+ Returns all edges in this graph as a set of (source, target, value) tuples
+ '''
+ return frozenset(self._edges)
+
+ def edge_types(self):
+ return frozenset(self._edge_types)
+
+ def add_vertex(self, v):
+ self._vertices.add(v)
+
+ def add_edge(self, src, dst, value):
+ edge = (src, dst, value)
+ self.add_vertex(src)
+ self.add_vertex(dst)
+ self._edges.add(edge)
+ self._edge_types.add(value)
+
+ self._forward_edges[src].add(edge)
+ self._reverse_edges[dst].add(edge)
+ self._connecting_edges[(src, dst)].add(value)
+
+ def del_edge(self, edge):
+ if edge in self._edges:
+ src, dst, val = edge
+ self._edges.remove(edge)
+ self._forward_edges[src].remove(edge)
+ self._reverse_edges[dst].remove(edge)
+ self._connecting_edges[(src, dst)].remove(val)
+
+ def get_out_edges(self, src, reltype=None):
+ '''
+ Returns the set of outbound edges as (src, <target>, <value>) tuples
+ '''
+ if reltype is None:
+ return self._forward_edges.get(src, set())
+ else:
+ return set([edge for edge in self._forward_edges.get(src, set()) if edge[2] == reltype])
+
+ def get_in_edges(self, dst, reltype=None):
+ '''
+ Returns the set of inbound edges as (<source>, dst, <value>) tuples
+ '''
+ if reltype is None:
+ return self._reverse_edges.get(dst, set())
+ else:
+ return set([edge for edge in self._reverse_edges.get(dst, set()) if edge[2] == reltype])
+
+ def get_neighbors(self, vertex):
+ out_neighbors = frozenset([t for s, t, v in self.get_out_edges(vertex)])
+ in_neighbors = frozenset([s for s, t, v in self.get_in_edges(vertex)])
+
+ return out_neighbors.union(in_neighbors)
+
+ def get_reachable_vertices(self, src, max_depth=None):
+ '''
+ Returns the set of all vertices that can be reached by following
+ directed edges out of src.
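+
+        For example, with max_depth=1 the result is src together with
+        its immediate successors.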
+ '''
+ found_vertices = set()
+
+        def traverse(graph, v, depth):
+            found_vertices.add(v)
+            if max_depth is not None and depth == max_depth:
+                return
+            for _, next_v, _ in graph.get_out_edges(v):
+                # Skip already-visited vertices; without this check the
+                # recursion would not terminate on cyclic graphs.
+                if next_v not in found_vertices:
+                    traverse(graph, next_v, depth + 1)
+        traverse(self, src, 0)
+
+ return frozenset(found_vertices)
+
+    def topological_sort(self, start):
+        done_order = []
+        visited = set()
+
+        def dfs_traverse(graph, v):
+            visited.add(v)
+            for _, next_v, _ in graph.get_out_edges(v):
+                # Only descend into unvisited vertices; this avoids
+                # infinite recursion on cycles and repeated work on
+                # shared successors.
+                if next_v not in visited:
+                    dfs_traverse(graph, next_v)
+            done_order.append(v)
+
+        dfs_traverse(self, start)
+        return reversed(done_order)
+
+ def get_connecting_edge_types(self, src, dst):
+ return self._connecting_edges[(src, dst)]
+
+ def subgraph_from_vertices(self, vertices):
+ g = SemanticNetwork()
+
+ for v in vertices:
+ g.add_vertex(v)
+ for source, target, value in self.get_out_edges(v):
+ if target in vertices:
+ g.add_edge(source, target, value)
+
+ return g
+
+    def enumerate_without_repeated_edges(self):
+        '''
+        Enumerates the graphs h containing a subset of the edges of
+        this graph g, such that every pair of vertices connected by
+        one or more relationships in g is connected by at most one of
+        those relationships (in either direction) in h.
+        '''
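+        # For example, if the graph contains the edges ('a', 'b', 1)
+        # and ('a', 'b', 2), the enumeration yields variants with no
+        # a-b edge, with ('a', 'b', 1) only, and with ('a', 'b', 2)
+        # only, each combined with every variant of the rest of the
+        # graph.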
+ if len(self.edges()) == 0:
+ yield copy.deepcopy(self)
+ else:
+ g = copy.deepcopy(self)
+ for vertices, reltypes in self._connecting_edges.iteritems():
+ src,dst = vertices
+
+ backward_reltypes = set()
+ if (dst, src) in self._connecting_edges:
+ backward_reltypes = self._connecting_edges[(dst, src)]
+
+                # del_edge can leave empty sets behind in
+                # _connecting_edges, meaning there are no longer any
+                # edges between these two vertices
+ if len(reltypes) + len(backward_reltypes) == 0:
+ continue
+
+ for rel in reltypes:
+ g.del_edge((src, dst, rel))
+ for rel in backward_reltypes:
+ g.del_edge((dst, src, rel))
+
+ for graph_portion in g.enumerate_without_repeated_edges():
+ yield graph_portion
+
+ for rel in reltypes:
+ h = copy.deepcopy(graph_portion)
+ h.add_edge(src, dst, rel)
+ yield h
+
+ for rel in backward_reltypes:
+ h = copy.deepcopy(graph_portion)
+ h.add_edge(dst, src, rel)
+ yield h
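+
+                # Only the first vertex pair with edges is expanded here;
+                # the recursive call above handles all remaining pairs.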
+ break
+
+ def __repr__(self):
+ return "[SemanticNetwork:\nVertices: %r \nEdges: %r\n]" % (self.vertices(), self.edges())
+
+if __name__ == "__main__":
+ g = SemanticNetwork()
+ g2 = SemanticNetwork()
+
+ g.add_vertex("v1")
+ g.add_edge("v1", "v2", 4)
+ g.add_edge("v2", "v3", 5)
+
+ g2.add_vertex("v1")
+ g2.add_edge("v1", "v2", 4)
+ g2.add_edge("v2", "v3", 5)
+
+    print list(g2.topological_sort("v1"))
+
+ reachable = g2.get_reachable_vertices("v1")
+ assert(reachable == frozenset(["v2", "v1", "v3"]))
+ reachable = g2.get_reachable_vertices("v1", max_depth=1)
+ assert(reachable == frozenset(["v2", "v1"]))
+
+ assert(g.vertices() == set(["v1", "v2", "v3"]))
+ assert(g.edges() == set([("v1", "v2", 4), ("v2", "v3", 5)]))
+
+ g = SemanticNetwork()
+ g.add_edge('a', 'b', 1)
+ g.add_edge('a', 'b', 2)
+ g.add_edge('a', 'b', 3)
+ g.add_edge('b', 'c', 1)
+ g.add_edge('b', 'c', 2)
+ g.add_edge('a', 'c', 1)
+ g.add_vertex('d')
+
+ for h in g.enumerate_without_repeated_edges():
+ print h
+
=== modified file 'csc/divisi/svd.py'
--- csc/divisi/svd.py 2009-05-27 04:31:34 +0000
+++ csc/divisi/svd.py 2009-05-28 18:05:43 +0000
@@ -174,7 +174,7 @@
ranges from -1 to 1.
'''
u = self.weighted_u
- angles = self.u_distances_(vec)
+ angles = self.u_dotproducts_with(vec)
vec_magnitude = sqrt(vec*vec)
# Normalize distances to get the cos(angles)
for key, value in angles.iteritems():