# ngrams.py
__author__ = 'cris'
from itertools import islice
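'''
Helpers for spotting Twitter-specific tokens (hashtags, URLs, mentions) and
for building n-grams from a token sequence.

The window* generators slide a window of width n over the iterable
s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...
and yield each window as a single string with its items joined by one space.
'''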
def is_hashtag(word):
    """Return True if *word* begins with '#', False otherwise.

    Only the first character is inspected; a '#' later in the word does
    not count.
    """
    return word.startswith('#')
def is_url(word):
    """Return True if *word* begins with the literal prefix 'http'.

    This is a prefix test only, so it matches both 'http://...' and
    'https://...' tokens, and also any other word that happens to start
    with those four characters.
    """
    return word.startswith('http')
def is_mention(word):
    """Return True if *word* begins with '@', False otherwise."""
    return word.startswith('@')
def is_url_or_mention(word):
    """Return True if *word* begins with 'http' or with '@'."""
    # str.startswith accepts a tuple of prefixes and tests them in one call.
    return word.startswith(('http', '@'))
def contains_hashtag(iterable):
    """Return True if any element of *iterable* begins with '#'.

    Stops at the first matching element; an empty iterable gives False.
    """
    return any(elem.startswith('#') for elem in iterable)
def contains_mention(iterable):
    """Return True if any element of *iterable* begins with '@'.

    Stops at the first matching element; an empty iterable gives False.
    """
    return any(elem.startswith('@') for elem in iterable)
def contains_url(iterable):
    """Return True if any element of *iterable* begins with 'http'.

    Stops at the first matching element; an empty iterable gives False.
    """
    return any(elem.startswith('http') for elem in iterable)
def contains_urls_mentions(iterable):
    """Return True if any element of *iterable* begins with 'http' or '@'.

    Stops at the first matching element; an empty iterable gives False.
    """
    # One startswith call with a tuple of prefixes replaces the `or` chain.
    return any(elem.startswith(('http', '@')) for elem in iterable)
def contains_non_words(iterable):
    """Return True if any element of *iterable* begins with 'http', '#' or '@'.

    Those three prefixes are what this module treats as non-word tokens
    (URLs, hashtags and mentions). Stops at the first matching element;
    an empty iterable gives False.
    """
    # One startswith call with a tuple of prefixes replaces the `or` chain.
    return any(elem.startswith(('http', '#', '@')) for elem in iterable)
def window(seq, n=2):
    """Yield each run of *n* consecutive items of *seq* as one string.

    The items of every window are joined with a single space, so for
    n=2 the tokens ['a', 'b', 'c'] produce 'a b' and then 'b c'.
    Nothing is yielded when *seq* holds fewer than *n* items.

    Because this is a generator, the ValueError below is raised on the
    first iteration, not at call time.

    Raises:
        ValueError: if *n* is smaller than 1.
    """
    # n == 0 used to slip through and produce an empty string followed by
    # every single item; negative n already failed inside islice. Reject
    # both the same way.
    if n < 1:
        raise ValueError("n must be at least 1, got %r" % (n,))
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield u' '.join(result)
    # If the first window was short, `it` is already exhausted and this
    # loop does nothing.
    for elem in it:
        # Slide by one: drop the oldest item, append the newest.
        result = result[1:] + (elem,)
        yield u' '.join(result)
def window_no_twitter_elems(seq, n=2):
    """Yield space-joined windows of *n* items that hold no non-word token.

    Slides a window of width *n* over *seq* and skips every window for
    which contains_non_words() is true, i.e. any window with an item that
    starts with 'http', '#' or '@'. Nothing is yielded when *seq* holds
    fewer than *n* items.

    Because this is a generator, the ValueError below is raised on the
    first iteration, not at call time.

    Raises:
        ValueError: if *n* is smaller than 1.
    """
    # n == 0 used to slip through and produce an empty string followed by
    # single items; negative n already failed inside islice.
    if n < 1:
        raise ValueError("n must be at least 1, got %r" % (n,))
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n and not contains_non_words(result):
        yield u' '.join(result)
    # If the first window was short, `it` is already exhausted and this
    # loop does nothing.
    for elem in it:
        # Slide by one: drop the oldest item, append the newest.
        result = result[1:] + (elem,)
        if not contains_non_words(result):
            yield u' '.join(result)
def window_no_hashtags(seq, n=2):
    """Yield space-joined windows of *n* items that hold no hashtag.

    Slides a window of width *n* over *seq* and skips every window for
    which contains_hashtag() is true, i.e. any window with an item that
    starts with '#'. Nothing is yielded when *seq* holds fewer than *n*
    items.

    Because this is a generator, the ValueError below is raised on the
    first iteration, not at call time.

    Raises:
        ValueError: if *n* is smaller than 1.
    """
    # n == 0 used to slip through and produce an empty string followed by
    # single items; negative n already failed inside islice.
    if n < 1:
        raise ValueError("n must be at least 1, got %r" % (n,))
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n and not contains_hashtag(result):
        yield u' '.join(result)
    # If the first window was short, `it` is already exhausted and this
    # loop does nothing.
    for elem in it:
        # Slide by one: drop the oldest item, append the newest.
        result = result[1:] + (elem,)
        if not contains_hashtag(result):
            yield u' '.join(result)
if __name__ == '__main__':
    # Manual smoke test: show the tokens, then every 3-token window.
    tweetsAsTokens = "this is @is #a tag test apple tree http://fu.com".split()
    print(tweetsAsTokens)
    # print(contains_url(tweetsAsTokens))
    # for i in tweetsAsTokens:
    #     print(is_url(i))
    for i in window(tweetsAsTokens, 3):
        # Single-argument print(...) behaves the same on Python 2 and 3,
        # unlike the Python-2-only `print i` statement.
        print(i)