Source code for semanticizest._util
from collections import Sequence
from six.moves import xrange
from six.moves.urllib.parse import quote
[docs]def ngrams_with_pos(lst, N=None):
"""Generate n-grams with indices from a list of strings.
Parameters
----------
lst : list-like of strings
N : int, optional
Maximum n-gram length, defaults to the length of `lst`.
Returns
-----
tuple (start, end, n-gram)
Tuples are start and end index in the original list `lst`,
the n-gram is the space joined string value. The n-grams are
yielded in leftmost longest order.
Raises
------
TypeError
If `N` is not an integer.
ValueError
If `N` is not at least 1.
"""
if N is None:
N = len(lst)
if not isinstance(N, int):
raise TypeError("n-gram order N should be an integer, was %s" %
type(N))
if N < 1:
raise ValueError("n-gram order N should be 1 or greater %s" % N)
join = " ".join
for start in xrange(len(lst)):
for n in xrange(1, 1 + min(N, len(lst) - start)):
yield start, start + n, join(lst[start:start + n])
[docs]def ngrams(lst, N=None):
"""Generate bare n-grams from a list of strings.
See Also
--------
ngrams_with_pos : for a description of the arguments.
"""
return (ng for _, _, ng in ngrams_with_pos(lst, N))
[docs]def tosequence(x):
"""Cast x to sequence. Returns x if at all possible."""
return x if isinstance(x, Sequence) else list(x)
[docs]def url_from_title(title, wiki):
"""Turn an article title into a Wikipedia URL.
Parameters
----------
wiki : string
Denotes the specific Wikipedia (language), e.g. "en".
"""
title = title.strip()
if not isinstance(title, bytes):
title = title.encode('utf-8')
title = title[0].upper() + title[1:] # Wikipedia-specific
title = quote(title.replace(' ', '_'), safe=',()/:')
return "https://{}.wikipedia.org/wiki/{}".format(wiki, title)