| """ |
| Needle in a haystack search |
| |
| Original source code is located here: |
| https://github.com/agapow/py-gsp/blob/master/gsp/motifsearch.py |
| """ |
|
|
| """ |
| A modifiable GSP algorithm. |
| """ |
|
|
| __version__ = '0.1' |
|
|
|
|
| |
|
|
| |
|
|
| PP_INDENT = 3 |
|
|
|
|
| |
|
|
| class GspSearch (object): |
| """ |
| A generic GSP algorithm, alllowing the individual parts to be overridden. |
| |
| This is setup so the object can be created once, but searched multiple times |
| at different thresholds. In this generic form, we assume that the transactions |
| are simply strings. |
| """ |
| |
| def __init__ (self, raw_transactions): |
| """ |
| C'tor, simply shaping the raw transactions into a useful form. |
| """ |
| self.process_transactions (raw_transactions) |
| |
| def process_transactions (self, raw_transactions): |
| """ |
| Create the alphabet & (normalized) transactions. |
| """ |
| self.transactions = [] |
| alpha = {} |
| for r in raw_transactions: |
| for c in r: |
| alpha[c] = True |
| self.transactions.append (r) |
| self.alpha = alpha.keys() |
| |
| def generate_init_candidates (self): |
| """ |
| Make the initial set of candidate. |
| |
| Usually this would just be the alphabet. |
| """ |
| return list (self.alpha) |
| |
| def generate_new_candidates (self, freq_pat): |
| """ |
| Given existing patterns, generate a set of new patterns, one longer. |
| """ |
| old_cnt = len (freq_pat) |
| old_len = len (freq_pat[0]) |
| print ("Generating new candidates from %s %s-mers ..." % (old_cnt, old_len)) |
| |
| new_candidates = [] |
| for c in freq_pat: |
| for d in freq_pat: |
| merged_candidate = self.merge_candidates (c, d) |
| if merged_candidate and (merged_candidate not in new_candidates): |
| new_candidates.append (merged_candidate) |
| |
| |
| return new_candidates |
| |
| def merge_candidates (self, a, b): |
| if a[1:] == b[:-1]: |
| return a + b[-1:] |
| else: |
| return None |
| |
| def filter_candidates (self, trans_min): |
| """ |
| Return a list of the candidates that occur in at least the given number of transactions. |
| """ |
| filtered_candidates = [] |
| for c in self.candidates: |
| curr_cand_hits = self.single_candidate_freq (c) |
| if trans_min <= curr_cand_hits: |
| filtered_candidates.append ((c, curr_cand_hits)) |
| return filtered_candidates |
| |
| def single_candidate_freq (self, c): |
| """ |
| Return true if a candidate is found in the transactions. |
| """ |
| hits = 0 |
| for t in self.transactions: |
| if self.search_transaction (t, c): |
| hits += 1 |
| return hits |
| |
| def search_transaction (self, t, c): |
| """ |
| Does this candidate appear in this transaction? |
| """ |
| return (t.find (c) != -1) |
| |
| def search (self, threshold): |
| |
| assert (0.0 < threshold) and (threshold <= 1.0) |
| trans_cnt = len (self.transactions) |
| trans_min = trans_cnt * threshold |
| |
| print ("The number of transactions is: %s" % trans_cnt) |
| print ("The minimal support is: %s" % threshold) |
| print ("The minimal transaction support is: %s" % trans_min) |
| |
| |
| |
| self.candidates = list (self.generate_init_candidates()) |
| print ("There are %s initial candidates." % len (self.candidates)) |
| freq_patterns = [] |
| new_freq_patterns = self.filter_candidates (trans_min) |
| print ("The initial candidates have been filtered down to %s." % len (new_freq_patterns)) |
| |
| while True: |
| |
| if new_freq_patterns: |
| freq_patterns = new_freq_patterns |
| else: |
| return freq_patterns |
| |
| |
| self.candidates = self.generate_new_candidates ([x[0] for x in freq_patterns]) |
| print ("There are %s new candidates." % len (self.candidates)) |
| new_freq_patterns = self.filter_candidates (trans_min) |
| print ("The candidates have been filtered down to %s." % len (new_freq_patterns)) |
| |
| |
|
|
| __version__ = '0.1' |
|
|
| |
|
|
| NULL_SYMBOL = 'X' |
|
|
| |
|
|
| def HaystackSearch(needle, haystack): |
| """ |
| Return the index of the needle in the haystack |
| |
| Parameters: |
| needle: any iterable |
| haystack: any other iterable |
| |
| Returns: |
| the index of the start of needle or -1 if it is not found. |
| |
| Looking for a sub-list of a list is actually a tricky thing. This |
| approach uses the Boyer-Moore-Horspool algorithm. Needle and haystack |
| should be any iterable, as long as their elements are hashable. |
| Example: |
| |
| >>> find ([1, 2], [1, 1, 2]) |
| 1 |
| >>> find ((1, 2, 3), range (10)) |
| 1 |
| >>> find ('gh', 'abcdefghi') |
| 6 |
| >>> find ([2, 3], [7, 8, 9]) |
| -1 |
| """ |
| h = len (haystack) |
| n = len (needle) |
| skip = {needle[i]: n - i - 1 for i in range(n - 1)} |
| i = n - 1 |
| while i < h: |
| for j in range(n): |
| if haystack[i - j] != needle[-j - 1]: |
| i += skip.get(haystack[i], n) |
| break |
| else: |
| return i - n + 1 |
| return -1 |