-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMethods.py
More file actions
38 lines (32 loc) · 1.14 KB
/
Copy pathMethods.py
File metadata and controls
38 lines (32 loc) · 1.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import re
from nltk.corpus import stopwords
# read Stopwords
def read_stopwords():
    """Build the combined stop-word set used by the cleaning helpers.

    The result is the union of three sources:

    * every line of ``SmartStoplist.txt`` (opened relative to the current
      working directory), with newline characters removed;
    * a short hard-coded list of chat abbreviations;
    * ``stopwords.words("english")`` from NLTK.

    Returns:
        set: all stop words from the three sources.

    Raises:
        OSError: if ``SmartStoplist.txt`` cannot be opened.
    """
    # The context manager guarantees the file handle is closed, even if
    # reading fails part-way through.
    with open("SmartStoplist.txt", "r") as fp:
        # Each line still carries its newline; remove it so the entries
        # compare equal to plain tokens.
        stop_word_set = {line.replace("\n", "") for line in fp}

    # Informal abbreviations that the list file does not supply.
    stop_word_set.update(
        ['omg', 'u', 'lol', 'gm', 'gn', 'gd9t', 'tc', 'rt', 'oops', 'ok']
    )
    stop_word_set.update(stopwords.words("english"))
    return stop_word_set
def text_cleaning(text, stop_words):
    """Strip URLs, markup and punctuation from *text* and drop stop words.

    The substitutions run in a fixed order, then the text is split on single
    spaces and every token found in *stop_words* is discarded. Matching is
    case-sensitive and empty tokens (from leading/trailing spaces) are kept
    unless ``""`` is itself in *stop_words*. The cleaned string is printed
    to stdout before being returned.

    Args:
        text: the raw string to clean.
        stop_words: a container supporting ``in`` with the tokens to drop.

    Returns:
        str: the remaining tokens joined by single spaces.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r"http\S+", ""),            # drop URLs
        (r"#", ""),                  # drop the hash sign, keep the tag text
        ("</?.*?>", " <> "),         # swap tag-like <...> spans for a placeholder
        (r"[^a-zA-Z0-9.]+", ' '),    # collapse everything but letters, digits, dots
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept_tokens = []
    for token in cleaned.split(" "):
        if token in stop_words:
            continue
        kept_tokens.append(token)

    result = " ".join(kept_tokens)
    print(result)
    return result
def test_data_cleaning(text):
text = re.sub(r"http\S+", "", text) # remove url
text = re.sub(r"@\S+", "", text) # remove url
text = re.sub(r"RT","",text)
text = re.sub(r"#\S+", "", text)
return text