Tokenizing and Removing Stopwords from JSON using nltk - python

Hi I keep getting this error:
D:\WinPython-32bit-2.7.10.3\python-2.7.10>python TweetTest.py Twitter.json
Traceback (most recent call last):
File "TweetTest.py", line 60, in <module>
tweet = json.loads(line)
File "D:\WinPython-32bit-2.7.10.3\python-2.7.10\lib\json\__init__.py", line 338, in loads
return _default_decoder.decode(s)
File "D:\WinPython-32bit-2.7.10.3\python-2.7.10\lib\json\decoder.py", line 369, in decode
raise ValueError(errmsg("Extra data", s, end, len(s)))
ValueError: Extra data: line 1 column 4488 - line 1 column 99678411 (char 4487 - 99678410)
I have no idea what is wrong. My code is as follows:
import sys
import json
from collections import Counter
import re
from nltk.corpus import stopwords
import string
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']
emoticons_str = r"""
(?:
[:=;] # Eyes
[oO\-]? # Nose (optional)
[D\)\]\(\]/\\OpP] # Mouth
)"""
regex_str = [
emoticons_str,
r'<[^>]+>', # HTML tags
r'(?:#[\w_]+)', # #-mentions
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
r'http[s]?://(?:[a-z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
r'(?:[\w_]+)', # other words
r'(?:\S)' # anything else
]
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
return tokens_re.findall(s)
def preprocess(s, lowercase=False):
tokens = tokenize(s)
if lowercase:
tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
return tokens
if __name__ == '__main__':
fname = sys.argv[1]
with open(fname, 'r') as f:
count_all = Counter()
for line in f:
tweet = json.loads(line)
tokens = preprocess(tweet['text'])
count_all.update(tokens)
print(count_all.most_common(5))
This is the first two output of my JSON file. I have used a Tweet Stream listener to collect the tweets.
{"created_at":"Wed Apr 06 08:33:55 +0000 2016","id":717631408345333760,"id_str":"717631408345333760","text":"RT #whosharold: Hilary Clinton cannot be president pls she can't even hold her man down what makes ya think she gon hold the office down","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":472387071,"id_str":"472387071","name":"BigGucciK 2x","screen_name":"KaisonThatBoy","location":"Bridgeport, CT","url":null,"description":null,"protected":false,"verified":false,"followers_count":1608,"friends_count":1219,"listed_count":8,"favourites_count":1293,"statuses_count":64337,"created_at":"Mon Jan 23 22:07:27 +0000 2012","utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/709500377104818182\/4vMu066C_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/709500377104818182\/4vMu066C_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/472387071\/1457000395","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Apr 06 03:16:15 +0000 2016","id":717551464575401984,"id_str":"717551464575401984","text":"Hilary Clinton cannot be president pls she can't even hold her man down what makes ya think she gon hold the office down","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":792436550,"id_str":"792436550","name":"sadboyz","screen_name":"whosharold","location":null,"url":null,"description":"platano maduro no vuelve a verde","protected":false,"verified":false,"followers_count":1285,"friends_count":979,"listed_count":11,"favourites_count":4877,"statuses_count":91425,"created_at":"Thu Aug 30 21:26:30 +0000 2012","utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/773304539\/94dbc3d1558da7f1e3d2c6fffcb5d710.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/773304539\/94dbc3d1558da7f1e3d2c6fffcb5d710.jpeg","profile_background_tile":true,"profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/714669878012219392\/9HmilvPG_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/714669878012219392\/9HmilvPG_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/792436550\/1458855437","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":2,"favorite_count":7,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"whosharold","name":"sadboyz","id":792436550,"id_str":"792436550","indices":[3,14]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1459931635353"}
{"created_at":"Wed Apr 06 08:33:55 +0000 2016","id":717631409742020609,"id_str":"717631409742020609","text":"RT #WisegalGranny: HONY Just Destroyed Donald Trump\u2019s Dream Of Becoming President - https:\/\/t.co\/8GIDVa76bZ Oooo, that's gonna hurt! #Unite\u2026","source":"\u003ca href=\"https:\/\/roundteam.co\" rel=\"nofollow\"\u003eRoundTeam\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2846552432,"id_str":"2846552432","name":"Glenn Silva","screen_name":"GlennSilva76","location":"hawaii","url":null,"description":"Christian, Constitutional Conservative, Pro 1A 2A and RF, It's Time To Unite And Take Our Country Back! #NeverTrump\r\n#UniteWithCruz #CruzCrew #CruzToVictory","protected":false,"verified":false,"followers_count":1981,"friends_count":2408,"listed_count":99,"favourites_count":1819,"statuses_count":38301,"created_at":"Wed Oct 08 07:34:50 +0000 2014","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/691834454868889601\/1gkIbY1C_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/691834454868889601\/1gkIbY1C_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/2846552432\/1453447926","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Apr 06 08:18:04 +0000 2016","id":717627418454966272,"id_str":"717627418454966272","text":"HONY Just Destroyed Donald Trump\u2019s Dream Of Becoming President - https:\/\/t.co\/8GIDVa76bZ Oooo, that's gonna hurt! #UniteWithCruz #NeverTrump","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":4726275950,"id_str":"4726275950","name":"Wisegal1958","screen_name":"WisegalGranny","location":null,"url":null,"description":null,"protected":false,"verified":false,"followers_count":475,"friends_count":290,"listed_count":73,"favourites_count":8976,"statuses_count":10881,"created_at":"Fri Jan 08 02:36:28 +0000 2016","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"2B7BB9","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/715082668770242561\/ohjXvK85_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/715082668770242561\/ohjXvK85_normal.jpg","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":1,"favorite_count":0,"entities":{"hashtags":[{"text":"UniteWithCruz","indices":[114,128]},{"text":"NeverTrump","indices":[129,140]}],"urls":[{"url":"https:\/\/t.co\/8GIDVa76bZ","expanded_url":"http:\/\/www.parhlo.com\/hony-just-destroyed-trumps-dream-of-becoming-president\/?track=twb","display_url":"parhlo.com\/hony-just-dest\u2026","indices":[65,88]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"UniteWithCruz","indices":[133,140]},{"text":"NeverTrump","indices":[139,140]}],"urls":[{"url":"https:\/\/t.co\/8GIDVa76bZ","expanded_url":"http:\/\/www.parhlo.com\/hony-just-destroyed-trumps-dream-of-becoming-president\/?track=twb","display_url":"parhlo.com\/hony-just-dest\u2026","indices":[84,107]}],"user_mentions":[{"screen_name":"WisegalGranny","name":"Wisegal1958","id":4726275950,"id_str":"4726275950","indices":[3,17]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1459931635686"}
Please help me. Thank you.

I had the same error once.
Your script loads a JSON object at each line read, the issue might be that your JSON objects are not separated by a newline.
For instance if your file contains
json_oject1
json_oject2
then the two objects will be read whereas if the file contains
json_oject1 json_oject2
you will get an error.
Solution: add a newline when writing a new JSON object to the output file.
(related: https://stackoverflow.com/a/21058946/2314737)

Related

Spark streaming from a File source

i have two notebooks : streaming.ipynb and File.ipynb (tools: spark3.2.1, colab notebook)
the streaming.ipynb:
import sys
from textblob import TextBlob
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
import string
import re
#Emoji patterns
emoji_pattern = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
"]+", flags=re.UNICODE)
# define stopwords
nltk.download('stopwords')
stopword = stopwords.words('english')
# import stemmer from nltk
ps = nltk.PorterStemmer()
# import lemmatizer from nltk
#wn = nltk.WordNetLemmatizer()
# import lemmatizer from nltk
import nltk
from nltk.stem import WordNetLemmatizer
wn = nltk.download('wordnet') #WordNetLemmatizer()
def abb(tweet):
tweet = re.sub(r"im|i'm|iam", "i am", tweet)
tweet = re.sub(r"he's", "he is", tweet)
tweet = re.sub(r"she's", "she is", tweet)
tweet = re.sub(r"that's", "that is", tweet)
tweet = re.sub(r"there's", "there is", tweet)
tweet = re.sub(r"what's", "what is", tweet)
tweet = re.sub(r"where's", "where is", tweet)
tweet = re.sub(r"\'ll", " will", tweet)
tweet = re.sub(r"\'ve", " have", tweet)
tweet = re.sub(r"\'re", " are", tweet)
tweet = re.sub(r"\'d", " would", tweet)
tweet = re.sub(r"\'ve", " have", tweet)
tweet = re.sub(r"won't", "will not", tweet)
tweet = re.sub(r"don't", "do not", tweet)
tweet = re.sub(r"did't", "did not", tweet)
tweet = re.sub(r"can't", "can not", tweet)
tweet = re.sub(r"it's", "it is", tweet)
tweet = re.sub(r"couldn't", "could not", tweet)
tweet = re.sub(r"wouldn't", "would not", tweet)
tweet = re.sub(r"have't", "have not", tweet)
return tweet
def getSubjectivity(text):
return TextBlob(text).sentiment.subjectivity
#Create a function to get the polarity
def getPolarity(text):
return TextBlob(text).sentiment.polarity
def getAnalysis(score):
if score < 0:
return "Negative"
elif score == 0:
return "Neutral"
else:
return "Positive"
def main():
sc = SparkContext(appName="PysparkStreaming")
ssc = StreamingContext(sc, 5) #Streaming will execute in each 3 seconds
lines = ssc.textFileStream('/content/drive/MyDrive/Colab Notebooks/log/') #'log/ mean directory name
counts = lines.flatMap(lambda line: line.split("")) \
.map(lambda x: x.lower())\
.map(lambda x: re.sub(r"#[A-Za-z0-9]+", "", x))\
.map(lambda x: re.sub(r"\\x[A-Za-z][0-9]+", "", x))\
.map(lambda x: re.sub(r"\\u[0-9][A-Za-z]|\\U[0-9][A-Za-z]+", "", x))\
.map(lambda x: re.sub(r'&amp;', '&', x)) \
.map(lambda x: re.sub(r"b[\s]+", "", x)) \
.map(lambda x: re.sub(r"https?:\/\/t.co\/[A-Za-z0-9]+", "", x)) \
.map(lambda x: abb(x)) \
.map(lambda x: re.sub("[^a-zA-Z0-9\s]+", "",x)) \
.map(lambda x: re.sub('[0-9]+', ' ', x))\
.map(lambda x: emoji_pattern.sub(r'', x) )\
.map(lambda x: "".join([char for char in x if char not in string.punctuation])) \
.map(lambda x: re.split('\W+', x)) \
.map(lambda x: " ".join(x))\
.map(lambda x: [word for word in x if word not in stopword])\
.map(lambda x: getPolarity(x))
counts.pprint()
ssc.start()
ssc.awaitTermination()
if __name__ == "__main__":
main()
and File.ipynb:
from random import randint
import time
"""
This is use for create 30 file one by one in each 5 seconds interval.
These files will store content dynamically from 'lorem.txt' using below code
"""
def main():
a = 1
with open('/content/drive/MyDrive/Colab Notebooks/lorem.txt', 'r',encoding='latin-1') as file: # reading content from 'lorem.txt' file
lines = file.readlines()
while a <= 30:
totalline = len(lines)
linenumber = randint(0, totalline - 10)
with open('/content/drive/MyDrive/Colab Notebooks/log/log.txt'.format(a), 'w') as writefile:
writefile.write(' '.join(line for line in lines[linenumber:totalline]))
print('creating file log{}.txt'.format(a))
a += 1
time.sleep(5)
if __name__ == '__main__':
main()
Notes:
in File.ipynb, i read lorem.txt which contains tweets and separate data in log.txt which is NULL but when you run the code it will be create log{1} contains a part of tweets, log{2} also and so on.... and streaming.ipynb stream from those logs each 5 seconds print the result, lorem.txt contains:
'A woman faces 100 lashes and up to seven years in prison because she was a VICTIM of RAPE in Qatar 🇶🇦\n\nShe was told she could possibly avoid criminal charges if she married 💍her attacker\n\n#FIFA22 #WorldCup2022 #Qatar2022 #Qatar'
'#WorldCup2022 '
'i love my mother'
'it is so bad!'
'Black Stars technical team among 32 for 2022 Aspire Academy Global Summit\n\n- \n\n.#Ghana #WorldCup2022 '
'🔊New Episode 163 - The Business End out now!!! #WorldCup2022 #TorontoFC #RealMadridChelsea #LiverpoolVillarreal #ManCityRealMadrid #ARSMUN #ACMILAN #ChampionsLeague #uefachampionsleague \nOut now on all popular audio podcast platforms🎶 '
'Check out new work on my #Behance profile: "Edited" \n#edit #photoshop #photography #road #street #art #design #architect #amazon #quote #fun #amazing #EidMubarak #ukraine #putin #gold #oil #WorldCup2022'
'Can we talk about how bad the Al Rihla looks #WorldCup2022 '
'They should hire Mourinho as a consultant for the #WorldCup2022'
'#yuzi_chahal #imkuldeep18 The spin duo back with bang bang performance for representing franchise Eagerly Waiting to see in India team behind the stump by Rishaph pant The selectors are very big problem to pick a players for #WorldCup2022\n#IPL20222'
'#DraganflyInc Draganflyer Commander can sanitize the entire 50,000 seater stadium in 4-6 hrs. The #WorldCup2022 in #Qatar2022 is slowly approaching. This drone could really help keep stadiums safe. Food for thought.\n\n#CovidIsntOver \n#GOPTaxScam \n#IMPEACHBIDENNOW \n#BTC \n#NFTs '
'NFT News 👇\n\nTooneyChain: when NFTs invite themselves to the 2022 football world cup\n#TooneyChainNFT \n\n#WorldCup2022 #football #NFTs #NFTcollectibles '
'Mark my words. 3-1 USA over England this fall in the #WorldCup2022 . If we win, England has to call it soccer from now on.'
'you can as well use our services in projects that are not permanent like #Containerhouse or #structures for #WorldCup2022 ,also for #camps ,#swimmingpool #parks #offices #dormitory'
'#bhogleharsha Than we need to forgot about #BhuviOfficial ?👀\n#ipl2022 #WorldCup2022'
'Ghana🇬🇭 and Tunisia🇹🇳 have been invited to take part in the Kirin Super Cup 2022, which will be hosted by Japan from Friday, June 10 to Tuesday, June 14, 2022, in the lead up to the #WorldCup2022 in Qatar.\n\nThe other 2 participants are hosts Japan🇯🇵 and Chile🇨🇱 #AfricanFootball'
'join us on TG for our game of the day!⚽️⚽️ The World Cup token team is rooting for a #Cristiano goal today! Good luck to both sides today! #ManUtd 🔥 #ChelseaFC $WCT #WCT #WctArmy #WorldCup2022 #Qatar2022 #100xgem #BSC #BNB\xa0\xa0\xa0#BSCGem #BNBchain #1000xgem #moonshot #ETH #CRO #BTC '
'#julietbawuah #tv3_ghana Wow, super talented player. Who knows, with him on #Qatar’s team they might just win the #WorldCup2022. #Qatar2022 #Fooball #games'
'#ForthHelena #thevinetway #jodyvance Jody are you a QA asset?\nCan Jody tweet about LGBTQ issues? Notable events from around the world ignored & Jody is regurgitating CNn narratives?\nToo many ads not enough of what you want? SiriusXM commercial free entertainment.\n#Canucks \n#WorldCup2022\n#NFLDraft\n#TikTok '
'Congratulations, Danyel! You’re really having an impact in this field at just the right time! This, alongside your new co-authored book on #Qatar & the #WorldCup2022 , AND our #CIRSGUQ research project on the same subject! ! 👏👏 '
'Its the crypto world cup final and you are the manager.\nWhich front 3 are you going with?\n\n#xrp #vet #qnt #ada #zil #xlm #doge #btc #cro #eth #sand #luna #WorldCup2022\n#cryptocurrency'
'#yuzi_chahal I Would like to see KulCha Combination in #WorldCup2022'
'It’s going to be scary, but not for us 🇵🇹\n\n#Portugal #WorldCup2022 '
the error is shown below:
Py4JJavaError Traceback (most recent call last)
<ipython-input-5-bb31ef108289> in <module>()
88
89 if __name__ == "__main__":
---> 90 main()
3 frames
/content/spark-3.2.1-bin-hadoop3.2/python/lib/py4j-0.10.9.3-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o23.awaitTermination.
: org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/streaming/util.py", line 68, in call
r = self.func(t, *rdds)
File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/streaming/dstream.py", line 170, in takeAndPrint
taken = rdd.take(num + 1)
File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/rdd.py", line 1568, in take
res = self.context.runJob(self, takeUpToNumLeft, p)
File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/context.py", line 1227, in runJob
sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/py4j-0.10.9.3-src.zip/py4j/java_gateway.py", line 1322, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/py4j-0.10.9.3-src.zip/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0) (9557898acd7f executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 619, in main
process()
File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 611, in process
please how can i resolve it?
I see two problems with your code:
As far as I understood you want to create a directory /content/drive/MyDrive/Colab Notebooks/log/ with files following the log<number>.txt pattern. There seems to be a bug in this line:
with open('/content/drive/MyDrive/Colab Notebooks/log/log.txt'.format(a), 'w') as writefile:
You don't actually create multiple files but overwriting the single log.txt file on every iteration due to the missing curly brackets in filepath.
The second problem is much more complex: you are writing to the local filesystem in File.ipynb. StreamingContext.textFileStream accepts HDFS filepath as an argument. Please read the following chapter in the documentation: https://spark.apache.org/docs/latest/streaming-programming-guide.html

Problem of running python program on line command

I'm running twitter_hashtag_frequency.py programm on line command with a json file test.jsonl as parameter and I still have a below error however I validated this json file there is no a format problem.
C:\Users\HP\PycharmProjects\Bonzanini_Book_Exercises>python twitter_hashtag_frequency.py test.jsonl
Traceback (most recent call last):
File "twitter_hashtag_frequency.py", line 18, in <module>
tweet = json.loads(line)
File "C:\Users\HP\Python\Python38\lib\json\__init__.py", line 357, in loads
return _default_decoder.decode(s)
File "C:\Users\HP\Python\Python38\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Users\HP\Python\Python38\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 1 (char 1)
This is the content of test.jsonl:
{"created_at":"Tue Jul 21 00:47:40 +0000 2020","id":1285375860199972866,"id_str":"1285375860199972866","text":"RT #CBCAlerts: Big spike in new cases of COVID-19 in B.C., with 102 confirmed over weekend. 'We do have the possibility of having explosive\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2564198800,"id_str":"2564198800","name":"Fayesella","screen_name":"frbaerwald","location":null,"url":null,"description":null,"translator_type":"none","protected":false,"verified":false,"followers_count":6,"friends_count":107,"listed_count":0,"favourites_count":1228,"statuses_count":88,"created_at":"Sun May 25 21:51:11 +0000 2014","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/1249529611760717826\/pmKLZKkR_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/1249529611760717826\/pmKLZKkR_normal.jpg","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Mon Jul 20 22:26:19 +0000 2020","id":1285340287104147457,"id_str":"1285340287104147457","text":"Big spike in new cases of COVID-19 in B.C., with 102 confirmed over weekend. 'We do have the possibility of having\u2026 https:\/\/t.co\/X9JH7qNj6o","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eTwitter Web App\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":19038934,"id_str":"19038934","name":"CBC News Alerts","screen_name":"CBCAlerts","location":"Toronto","url":"http:\/\/www.cbc.ca\/news\/","description":"Breaking national and international news alerts from CBC News, Canada's TV, radio, online and social media news leader.","translator_type":"none","protected":false,"verified":true,"followers_count":1304745,"friends_count":398,"listed_count":8806,"favourites_count":0,"statuses_count":142466,"created_at":"Thu Jan 15 21:03:19 +0000 2009","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme7\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme7\/bg.gif","profile_background_tile":false,"profile_link_color":"FF0000","profile_sidebar_border_color":"F2E195","profile_sidebar_fill_color":"FFF7CC","profile_text_color":"0C3E53","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/563807705530245120\/92toBEKN_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/563807705530245120\/92toBEKN_normal.jpeg","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"Big spike in new cases of COVID-19 in B.C., with 102 confirmed over weekend. 'We do have the possibility of having explosive growth here in our outbreak, if we're not careful,' Provincial Health Officer Dr. Bonnie Henry said. https:\/\/t.co\/dg1t2Q7MZU","display_text_range":[0,249],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/dg1t2Q7MZU","expanded_url":"http:\/\/cbc.ca\/1.5655625","display_url":"cbc.ca\/1.5655625","indices":[226,249]}],"user_mentions":[],"symbols":[]}},"quote_count":48,"reply_count":26,"retweet_count":144,"favorite_count":208,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/X9JH7qNj6o","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/1285340287104147457","display_url":"twitter.com\/i\/web\/status\/1\u2026","indices":[116,139]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"CBCAlerts","name":"CBC News Alerts","id":19038934,"id_str":"19038934","indices":[3,13]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1595292460857"}
This is twitter_hashtag_frequency.py code :
import sys
from collections import Counter
import json
def get_hashtags(tweet):
entities = tweet.get('entities', {})
hashtags = entities.get('hashtags', [])
return [tag['text'].lower() for tag in hashtags]
if __name__ == '__main__':
fname = sys.argv[1]
with open(fname,'r') as f:
hashtags = Counter()
for line in f:
tweet = json.loads(line)
hashtags_in_tweet = get_hashtags(tweet)
hashtags.update(hashtags_in_tweet)
for tag, count in hashtags.most_common(20):
print("{}: {}".format(tag, count))
Someone can help me to solve this problem? Maybe something is wrong in the code I don't know. I'll appreciate your help, it's been days that I have this problem.

Python Regex fetch string correctly but still has AttributeError

I want to fetch ftp account information form vsftp log by regex.
All of our accounts were named by user plus number such as user01, user02, user03.
Tue Sep 12 18:11:20 2017 1 ::ffff:172.18.1.168 3620 /ftptest.py a _ i r user01 ftp 0 * c
Tue Sep 12 18:12:51 2017 1 ::ffff:172.18.1.168 4211 /ftptest.py a _ i r user02 ftp 0 * c
Tue Sep 12 18:16:43 2017 1 ::ffff:172.18.1.168 4322 /ftptest.py a _ i r user03 ftp 0 * c
My code is as below:
#!/usr/bin/python
import re
with open("/var/log/xferlog") as ftplog:
for line in ftplog:
line = line.strip("\n")
pattern = re.compile(r'user[\d]+')
match = pattern.search(line)
print match.group()
The result can fetch the user account but also show error message AttributeError: 'NoneType' object has no attribute 'group'
The result:
user01
user02
user03
Traceback (most recent call last):
File "test8.py", line 10, in <module>
print match.group()
AttributeError: 'NoneType' object has no attribute 'group'
Can anyone give me some advice?
pattern.search(line) return None if there is no match to line.
So your code must add a condition on that.
#!/usr/bin/python
import re
with open("/var/log/xferlog") as ftplog:
for line in ftplog:
line = line.strip("\n")
pattern = re.compile(r'user[\d]+')
match = pattern.search(line)
if match:
print match.group()
Regards Youenn.
Use a if statement to deal with the case where pattern does not match.
...
if match:
print match.group() # or anything
But note that this will silence all cases where there is no match. If you want to track those (maybe for debug) you can add
else:
print line
I'm not able to get user01, user02, user03 to print based on your sample data and code, but it looks like your regex isn't capturing the values correctly. To help you troubleshoot I'd recommend using using Python debugger to help you walk your code:
#!/usr/bin/python
import re
with open("sample") as ftplog:
for line in ftplog:
line = line.strip("\n")
pattern = re.compile(r'sparq[\d]+')
match = pattern.search(line)
if match is None:
import pdb; pdb.set_trace()
print match.group()

How can i get my files to be opened?

Hi there im working on a function that merges two separate .txt files and outputs a personalized letter. The problem is, is that i can include my text within the funciton module and it works perfectly. But when i try to open them in the function and to be used by the function i get this
error message:
Traceback (most recent call last):
File "/Users/nathandavis9752/CP104/davi0030_a10/src/q2_function.py", line 25, in
data = cleanData(q2)
File "/Users/nathandavis9752/CP104/davi0030_a10/src/q2_function.py", line 17, in cleanData
return [item.strip().split('\n\n') for item in query.split('--')]
AttributeError: 'file' object has no attribute 'split'
code:
letter = open('letter.txt', 'r')
q2 = open('q2.txt', 'r')
def cleanData(query):
return [item.strip().split('\n\n') for item in query.split('--')]
def writeLetter(template, variables, replacements):
# replace ith variable with ith replacement variable
for i in range(len(variables)):
template = template.replace(variables[i], replacements[i])
return template
data = cleanData(q2)
print (data)
variables = ['[fname]', '[lname]', '[street]', '[city]']
letters = [writeLetter(letter, variables, person) for person in data]
for i in letters:
print (i)
q2.txt file:
Michael
dawn
lock hart ln
Dublin
--
kate
Nan
webster st
king city
--
raj
zakjg
late Road
Toronto
--
dave
porter
Rock Ave
nobleton
letter.txt file:
[fname] [lname]
[street]
[city]
Dear [fname]:
As a fellow citizen of [city], you and all your neighbours
on [street] are invited to a celebration this Saturday at
[city]'s Central Park. Bring beer and food!
You are trying to split a file buffer rather than a string.
def cleanData(query):
return [item.strip().split('\n\n') for item in query.read().split('--')]

error with writelines(), when creating json objects

I wnat to convert the text file into json objects, and my inputfile i.e. text file has large number of objects(4 mb). It throw an error when I try to write the json objects into text file. Here is the error"writelines( ) argument must be a sequence of strings". Here is my input file:
created_at : 03 Ekim 2014 Cuma, 06:36, article : İSTANBUL (CİHAN)- Fethullah Gülen Hocaefendi'nin “421. Nağme: Şamatalarınız Haramîliğinizi Örtemeyecek!..” isimli yeni sohbeti, herkul.org sitesinde yayınlandı. Hocaefendi, "Şamatayla hangi şirretliği kapamak istediğini herkes anlıyor. Silinmez o zihinlerden" ifadelerini kullandı.Sohbetinde Allah Rasûlü (sallallâhu aleyhi ve sellem) Efendimiz’in, “Allahım beni kendi gözümde küçük, insanlar nazarında ise (yüklediğin misyona uygun.
created_at : 06 Ekim 2014 Pazartesi, 11:57, article : KAYSERİ (CİHAN)- Kimse Yok Mu Derneği Kayseri Şubesi, hayırseverlerin bağışlarıyla paket haline getirdiği kurban etlerini ihtiyaç sahiplerine ulaştırdı. Şehirde daha önce derneğe müracaatta bulunan ve tespit edilen aileler için şehrin 4 ayrı noktasında kurban eti dağıtım merkezi oluşturuldu. Kurban etlerini alan aileler ise Kimse Yok Mu ile yüzlerinin güldüğünü ve emeği geçenlere teşekkür ettiklerini söylediler. Geçen yıla göre ise bağış miktarlarının yüzde 50 oranında arttığı bildirildi. Kimse Yok Mu Derneği’nin Kayseri Şubesi’nde Kurban Bayramı nedeniyle hareketlilik yaşanıyor. Dernek, hayırseverlerin bağışladığı kurbanların kesimi yapıldıktan sonra. Here is my code:
#!usr/bin/python
import sys, os
import json
inputfile = open('bugun_data_collection_KimseYokmu.txt', 'r')
outputfile = open('bugun_data_collection_json_KimseYokmu.txt', 'w')
#shows how the dictionary looks like
reps = {"created_at": "date","article": "text"}
#reads the input file line by line
for line in inputfile:
outputfile.writelines((line, json.dumps(reps))
inputfile.close()
outputfile.close()
this is the error: "" line 11
inputfile.close()
^
SyntaxError: invalid syntax
Notice the error message: "writelines( ) argument must be a sequence of strings". It's throw out because the second element of the parameter(tuple type) is a dict not a string as expected. You can use json.dumps(reps) to convert it to string, like this:
outputfile.writelines((line, json.dumps(reps)))
Besides, you have put the file close operation in the for loop, this will cause another error when you write into or read from a closed file.
If you want to extract text from the input file, you can do it like this(did not deal with exceptions):
#reads the input file line by line
outputlines = []
for line in inputfile:
text = line.split('article : ')[1]
date = line.split('article : ')[0].split('created_at : ')[1]
reps = {"created_at": date,"article": text}
outputlines.append(json.dumps(reps))
outputfile.writelines(outputlines)

Categories

Resources