Twitter数据清理

时间:2018-12-02 07:54:50

标签: python json nlp

我正在尝试处理用 tweepy 收集并保存为 JSON 文件的推文。目前,我尝试用下面的代码来处理推文文本。[stream_Chicago.json 中包含带有关键字 Chicago 的推文]

# import json library to analysis tweet text
import json
# import re library to use regular expression

import nltk
nltk.download('punkt')

import re
from nltk.tokenize import word_tokenize


# Peek at the first record of the dump to inspect a tweet's structure.
# The file is read line by line; each non-blank line is parsed as its own
# JSON document.
with open('stream_Chicago.json', 'r') as f:
    # Take the first non-blank line only. A blank leading line would make
    # json.loads raise JSONDecodeError, so those are skipped.
    first_record = next((ln for ln in f if ln.strip()), None)

# Load it as a Python dictionary; stays None when the file has no records.
tweet = json.loads(first_record) if first_record is not None else None
# Uncomment to view the tweet's data structure.
# print(json.dumps(tweet, indent=4))


# Baseline: run NLTK's general-purpose tokenizer on a sample tweet, for
# comparison with the custom tweet tokenizer defined further down.
tweet = 'RT @marcobonzanini: just an example! :D http://example.com #NLP'
nltk_tokens = word_tokenize(tweet)
print(nltk_tokens)

# Emoticon pattern, written for re.VERBOSE: whitespace and "# ..." comments
# inside the pattern are ignored by the regex engine.
emoticons_str = r"""
(?:
    [:=;] # Eyes
    [oO\-]? # Nose (optional)
    [D\)\]\(\]/\\OpP] # Mouth
)"""

# Alternatives are tried in list order, so the more specific token kinds
# (emoticons, mentions, hash-tags, URLs) must come before the catch-alls.
# Because the combined pattern is compiled with re.VERBOSE, a literal '#'
# outside a character class has to be escaped (see the hash-tag pattern).
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)',  # anything else
]

# One capturing group around the whole alternation (every inner group is
# non-capturing), so findall() returns a flat list of token strings.
tokens_re = re.compile(
    r'(' + '|'.join(regex_str) + ')',
    re.VERBOSE | re.IGNORECASE,
)
# Anchored variant: matches only when the entire string is an emoticon.
emoticon_re = re.compile(
    r'^' + emoticons_str + '$',
    re.VERBOSE | re.IGNORECASE,
)

def tokenize(s):
    """Split the string ``s`` into tweet-aware tokens.

    Returns the list of all non-overlapping matches of ``tokens_re`` in
    ``s``, in order of appearance. An empty string yields an empty list.
    """
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    """Tokenize ``s`` and optionally lowercase the resulting tokens.

    Args:
        s: The text to tokenize.
        lowercase: When true, every token is lowercased except tokens that
            are, in their entirety, an emoticon (as decided by
            ``emoticon_re``); those keep their original case.

    Returns:
        The list of tokens produced by ``tokenize``.
    """
    tokens = tokenize(s)
    if lowercase:
        # Leave emoticons untouched so that e.g. ":D" does not become ":d".
        tokens = [
            token if emoticon_re.search(token) else token.lower()
            for token in tokens
        ]
    return tokens

# Run the custom tokenizer on the sample tweet and show the tokens.
tweet = "RT @marcobonzanini: just an example! :D http://example.com #NLP"
tokens = preprocess(tweet)
print(tokens)

# Walk the whole dump, parsing each line as its own JSON document, and
# print the text of every tweet.
# NOTE(review): the reported traceback is not visible here; blank lines and
# records without a 'text' key are the usual failure causes for this loop --
# confirm against the actual error.
with open('stream_Chicago.json', 'r') as f:
    for line in f:
        # json.loads('') raises JSONDecodeError, so skip blank lines.
        if not line.strip():
            continue
        tweet = json.loads(line)
        # Skip records that are not tweet objects carrying a 'text' field
        # instead of failing with KeyError.
        if not isinstance(tweet, dict) or 'text' not in tweet:
            continue
        print(tweet['text'])
        # To tokenize each tweet instead:
        # print(preprocess(tweet['text']))

但是我遇到以下错误:

enter image description here

请问我该如何解决这个错误?

0 个答案:

没有答案