Problem running a Python program from the command line - python
I'm running the twitter_hashtag_frequency.py program from the command line with a JSON Lines file, test.jsonl, as its parameter, and I keep getting the error below, even though I validated the JSON file and found no formatting problem.
C:\Users\HP\PycharmProjects\Bonzanini_Book_Exercises>python twitter_hashtag_frequency.py test.jsonl
Traceback (most recent call last):
File "twitter_hashtag_frequency.py", line 18, in <module>
tweet = json.loads(line)
File "C:\Users\HP\Python\Python38\lib\json\__init__.py", line 357, in loads
return _default_decoder.decode(s)
File "C:\Users\HP\Python\Python38\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Users\HP\Python\Python38\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 1 (char 1)
This is the content of test.jsonl:
{"created_at":"Tue Jul 21 00:47:40 +0000 2020","id":1285375860199972866,"id_str":"1285375860199972866","text":"RT #CBCAlerts: Big spike in new cases of COVID-19 in B.C., with 102 confirmed over weekend. 'We do have the possibility of having explosive\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2564198800,"id_str":"2564198800","name":"Fayesella","screen_name":"frbaerwald","location":null,"url":null,"description":null,"translator_type":"none","protected":false,"verified":false,"followers_count":6,"friends_count":107,"listed_count":0,"favourites_count":1228,"statuses_count":88,"created_at":"Sun May 25 21:51:11 +0000 2014","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/1249529611760717826\/pmKLZKkR_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/1249529611760717826\/pmKLZKkR_normal.jpg","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Mon Jul 20 22:26:19 +0000 2020","id":1285340287104147457,"id_str":"1285340287104147457","text":"Big spike in new cases of COVID-19 
in B.C., with 102 confirmed over weekend. 'We do have the possibility of having\u2026 https:\/\/t.co\/X9JH7qNj6o","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eTwitter Web App\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":19038934,"id_str":"19038934","name":"CBC News Alerts","screen_name":"CBCAlerts","location":"Toronto","url":"http:\/\/www.cbc.ca\/news\/","description":"Breaking national and international news alerts from CBC News, Canada's TV, radio, online and social media news leader.","translator_type":"none","protected":false,"verified":true,"followers_count":1304745,"friends_count":398,"listed_count":8806,"favourites_count":0,"statuses_count":142466,"created_at":"Thu Jan 15 21:03:19 +0000 2009","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme7\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme7\/bg.gif","profile_background_tile":false,"profile_link_color":"FF0000","profile_sidebar_border_color":"F2E195","profile_sidebar_fill_color":"FFF7CC","profile_text_color":"0C3E53","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/563807705530245120\/92toBEKN_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/563807705530245120\/92toBEKN_normal.jpeg","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"Big spike in new cases of COVID-19 in B.C., with 102 confirmed over weekend. 
'We do have the possibility of having explosive growth here in our outbreak, if we're not careful,' Provincial Health Officer Dr. Bonnie Henry said. https:\/\/t.co\/dg1t2Q7MZU","display_text_range":[0,249],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/dg1t2Q7MZU","expanded_url":"http:\/\/cbc.ca\/1.5655625","display_url":"cbc.ca\/1.5655625","indices":[226,249]}],"user_mentions":[],"symbols":[]}},"quote_count":48,"reply_count":26,"retweet_count":144,"favorite_count":208,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/X9JH7qNj6o","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/1285340287104147457","display_url":"twitter.com\/i\/web\/status\/1\u2026","indices":[116,139]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"CBCAlerts","name":"CBC News Alerts","id":19038934,"id_str":"19038934","indices":[3,13]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1595292460857"}
This is twitter_hashtag_frequency.py code :
import sys
from collections import Counter
import json
def get_hashtags(tweet):
    """Return the lower-cased hashtag texts of a decoded tweet.

    Args:
        tweet: A dict decoded from one tweet JSON object. Hashtags are read
            from ``tweet['entities']['hashtags']``, a list of dicts that
            each carry a ``'text'`` key.

    Returns:
        A list of hashtag strings in lower case, in their original order.
        The list is empty when the tweet has no ``entities`` or no
        ``hashtags`` entry, or when either of them is ``None``.

    Raises:
        KeyError: If a hashtag entry has no ``'text'`` key.
    """
    # ``.get(key, default)`` only covers a *missing* key; a JSON ``null``
    # decodes to None, so fall back explicitly with ``or``.
    entities = tweet.get('entities') or {}
    hashtags = entities.get('hashtags') or []
    return [tag['text'].lower() for tag in hashtags]
if __name__ == '__main__':
    # Usage: python twitter_hashtag_frequency.py <tweets.jsonl>
    # Reads one JSON-encoded tweet per line and prints the 20 most
    # frequent hashtags as "tag: count".
    fname = sys.argv[1]
    hashtags = Counter()
    # Be explicit about the encoding: the default is the locale code page
    # (e.g. cp1252 on Windows), which can fail on non-ASCII tweet text.
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (often a trailing empty line at the end of the
            # file) is not a JSON document: json.loads("\n") raises
            # "Expecting value: line 2 column 1 (char 1)". Skip such lines.
            if not line.strip():
                continue
            tweet = json.loads(line)
            hashtags.update(get_hashtags(tweet))
    for tag, count in hashtags.most_common(20):
        print("{}: {}".format(tag, count))
Can someone help me solve this problem? Maybe something is wrong in the code; I don't know. I'd appreciate your help, as I have been stuck on this problem for days.
Related
I can't do json()['graphql']['user'] for instagram API
I am trying to make a tool that gets all the information in JSON out of a Instagram profile page. Such as example : https://www.instagram.com/dave_saa/?__a=1 Whenever I try to do that, I get a error. The error is : simplejson.errors.JSONDecodeError: Expecting value: line 1 column 1 (char 0). ERROR IN MORE DETAIL Traceback (most recent call last): File "C:\Users\disco\PycharmProjects\IgOSINT\main.py", line 9, in <module> json_found_for_site = request_for_site.json() File "C:\Users\disco\AppData\Local\Programs\Python\Python39\lib\site-packages\requests\models.py", line 910, in json return complexjson.loads(self.text, **kwargs) File "C:\Users\disco\AppData\Local\Programs\Python\Python39\lib\site-packages\simplejson\__init__.py", line 525, in loads return _default_decoder.decode(s) File "C:\Users\disco\AppData\Local\Programs\Python\Python39\lib\site-packages\simplejson\decoder.py", line 370, in decode obj, end = self.raw_decode(s) File "C:\Users\disco\AppData\Local\Programs\Python\Python39\lib\site-packages\simplejson\decoder.py", line 400, in raw_decode return self.scan_once(s, idx=_w(s, idx).end()) simplejson.errors.JSONDecodeError: Expecting value: line 1 column 1 (char 0) MY PYTHON CODE import requests from termcolor import colored import json import simplejson target = str(input(colored('[+] Enter Target Username: ', 'blue'))) request_for_site = requests.get('https://www.instagram.com/' + target + '/?__a=1') json_found_for_site = request_for_site.json()['graphql']['user'] if (request_for_site.status_code == 200): print(colored('[+++] TARGET FOUND !', 'green')) print(colored( ''' [1] USERNAME [2] FULL NAME [3] BIO [4] HIGHLIGHTS [5] PHONE NUMBER [6] IS ACCOUNT PRIVATE OR PUBLIC [recommended FIRST] [7] Profile Picture [8] Followers [9] Followed [10] ID [11] IS VERIFIED ''', 'red' )) tool_option = str(input(colored('[+] ENTER NUMBER OPTION TO FIND: ', 'blue'))) if (tool_option == '1'): print(json_found_for_site['username']) elif (tool_option == '2'): 
print(json_found_for_site['full_name']) The program is still in development and not finished. But I get an error, which is why I stopped temporarily and asked for help here. Can someone help, please?
The document you are parsing is HTML, not JSON. You cannot parse HTML with a JSON parser; to parse HTML, you need an HTML parser.
Fetch data from SQL Server and convert to JSON / Error Expecting value: line 1 column 1 (char 0)
I would like fetch data from a SQL Server database and transform the result in a JSON format. Thats not difficult, but one column is already a JSON and I would like separate it, but the result is a little bit confusing. My code: rows = cursor.fetchall() print (rows) objects_list = [] for row in rows: d= collections.OrderedDict() #d["Bestelldatum"]= row[0].strftime("%Y-%m-%d %H:%M") a = json.loads(row[0]) #print(a) #d["Adresse"] = row[0] #d["Tor"] = a["tor"] #d["Stiege"] = a["stg"] #d["Stock"] = a["stk"] #d["Tür"] = a["tür"] #d["PLZ"] = a["plz"] objects_list.append(d) j = json.dumps(objects_list, ensure_ascii=False) print (j) My problem is row 1, that is a JSON, see the pic Picture from database If I run it so, I get this error messange in loads return _default_decoder.decode(s) File "C:\Users\acas1\AppData\Local\Programs\Python\Python39\lib\json\decoder.py", line 337, in decode obj, end = self.raw_decode(s, idx=_w(s, 0).end()) File "C:\Users\acas1\AppData\Local\Programs\Python\Python39\lib\json\decoder.py", line 355, in raw_decode raise JSONDecodeError("Expecting value", s, err.value) from None json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) I guess something is wrong with the JSON data which I get from the database I don't understand why some data have backslashes, because at SQL they not exist. Thats the result if I fetch just the raw data from the database: Picture from result in python I tried everything that they say in this post JSONDecodeError: Expecting value: line 1 column 1 (char 0) but nothing helps. I hope someone have an idea
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) - Error only occurs when code is nested
I am currently working on a project that retrieves data about car auctions, I have it set up to request a custom Ebay URL that uses their api, I request the page and convert it to a JSON for handling. The code runs with no errors at all if the code is by itself but if I put it within a function or within a conditional statement or anything else that means it is nested it will give me the JSON error json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) My code is here, however I dont know if it is an issue with my code as it works fine when it is not within a function ebayurl = "http://svcs.ebay.com/services/search/FindingService/v1?\ SECURITY-APPNAME=KyleOsbo-CarSearc-PRD-adf6708f9-c75353fe\ &OPERATION-NAME=findItemsAdvanced\ &SERVICE-VERSION=1.13.0\ &GLOBAL-ID=EBAY-GB\ &RESPONSE-DATA-FORMAT=JSON\ &REST-PAYLOAD\ &categoryId(0)=9801\ &outputSelector(0)=SellerInfo\ &keywords="+"honda%20civic" #The custom url was created based on my needs, I only want to search ebay uk and only within the cars category apiResult = requests.get(ebayurl) #Request the custom url parsedresult = apiResult.json() #Convert url to json format in order to extract information easier for item in (parsedresult["findItemsAdvancedResponse"][0]["searchResult"][0]["item"]): #JSON is set up as multi dimensional array, looks within it to extract values title = item["title"][0] price = item["sellingStatus"][0]["convertedCurrentPrice"][0]["__value__"] itemURL = item["viewItemURL"][0] location = item["location"][0] itemid = item["itemId"][0] with sqlite3.connect("results.db") as db: #Connecting to table ready to insert new records cursor = db.cursor() values = (itemid, title, price, location, itemURL) #Declaring values that will be inserted,preventing sql injection, these values will change upon every iteration sql = """ INSERT INTO ebay_results(item_id, title, price, location, itemURL) VALUES(?,?,?,?,?) 
""" cursor.execute(sql, values) #Inserts a new record for every item found db.commit() The error occurs at parsedresult = apiResult.json() Traceback (most recent call last): File "C:\Users\ikoze\Documents\Computer Science\Coursework files\carrySearch.py", line 94, in <module> parsedresult = apiResult.json() #Convert url to json format in order to extract information easier File "C:\Users\ikoze\AppData\Local\Programs\Python\Python36-32\lib\site- packages\requests\models.py", line 892, in json return complexjson.loads(self.text, **kwargs) File "C:\Users\ikoze\AppData\Local\Programs\Python\Python36- 32\lib\json__init__.py", line 354, in loads return _default_decoder.decode(s) File "C:\Users\ikoze\AppData\Local\Programs\Python\Python36- 32\lib\json\decoder.py", line 339, in decode obj, end = self.raw_decode(s, idx=_w(s, 0).end()) File "C:\Users\ikoze\AppData\Local\Programs\Python\Python36- 32\lib\json\decoder.py", line 357, in raw_decode raise JSONDecodeError("Expecting value", s, err.value) from None json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Tokenizing and Removing Stopwords from JSON using nltk
Hi I keep getting this error: D:\WinPython-32bit-2.7.10.3\python-2.7.10>python TweetTest.py Twitter.json Traceback (most recent call last): File "TweetTest.py", line 60, in <module> tweet = json.loads(line) File "D:\WinPython-32bit-2.7.10.3\python-2.7.10\lib\json\__init__.py", line 338, in loads return _default_decoder.decode(s) File "D:\WinPython-32bit-2.7.10.3\python-2.7.10\lib\json\decoder.py", line 369, in decode raise ValueError(errmsg("Extra data", s, end, len(s))) ValueError: Extra data: line 1 column 4488 - line 1 column 99678411 (char 4487 - 99678410) I have no idea what is wrong. My code is as follows: import sys import json from collections import Counter import re from nltk.corpus import stopwords import string punctuation = list(string.punctuation) stop = stopwords.words('english') + punctuation + ['rt', 'via'] emoticons_str = r""" (?: [:=;] # Eyes [oO\-]? # Nose (optional) [D\)\]\(\]/\\OpP] # Mouth )""" regex_str = [ emoticons_str, r'<[^>]+>', # HTML tags r'(?:#[\w_]+)', # #-mentions r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags r'http[s]?://(?:[a-z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and ' r'(?:[\w_]+)', # other words r'(?:\S)' # anything else ] tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE) emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE) def tokenize(s): return tokens_re.findall(s) def preprocess(s, lowercase=False): tokens = tokenize(s) if lowercase: tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens] return tokens if __name__ == '__main__': fname = sys.argv[1] with open(fname, 'r') as f: count_all = Counter() for line in f: tweet = json.loads(line) tokens = preprocess(tweet['text']) count_all.update(tokens) print(count_all.most_common(5)) This is the first two output of my JSON file. I have used a Tweet Stream listener to collect the tweets. 
{"created_at":"Wed Apr 06 08:33:55 +0000 2016","id":717631408345333760,"id_str":"717631408345333760","text":"RT #whosharold: Hilary Clinton cannot be president pls she can't even hold her man down what makes ya think she gon hold the office down","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":472387071,"id_str":"472387071","name":"BigGucciK 2x","screen_name":"KaisonThatBoy","location":"Bridgeport, CT","url":null,"description":null,"protected":false,"verified":false,"followers_count":1608,"friends_count":1219,"listed_count":8,"favourites_count":1293,"statuses_count":64337,"created_at":"Mon Jan 23 22:07:27 +0000 2012","utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/709500377104818182\/4vMu066C_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/709500377104818182\/4vMu066C_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/472387071\/1457000395","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Apr 06 03:16:15 +0000 
2016","id":717551464575401984,"id_str":"717551464575401984","text":"Hilary Clinton cannot be president pls she can't even hold her man down what makes ya think she gon hold the office down","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":792436550,"id_str":"792436550","name":"sadboyz","screen_name":"whosharold","location":null,"url":null,"description":"platano maduro no vuelve a verde","protected":false,"verified":false,"followers_count":1285,"friends_count":979,"listed_count":11,"favourites_count":4877,"statuses_count":91425,"created_at":"Thu Aug 30 21:26:30 +0000 2012","utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/773304539\/94dbc3d1558da7f1e3d2c6fffcb5d710.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/773304539\/94dbc3d1558da7f1e3d2c6fffcb5d710.jpeg","profile_background_tile":true,"profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/714669878012219392\/9HmilvPG_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/714669878012219392\/9HmilvPG_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/792436550\/1458855437","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retwee
t_count":2,"favorite_count":7,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"whosharold","name":"sadboyz","id":792436550,"id_str":"792436550","indices":[3,14]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1459931635353"} {"created_at":"Wed Apr 06 08:33:55 +0000 2016","id":717631409742020609,"id_str":"717631409742020609","text":"RT #WisegalGranny: HONY Just Destroyed Donald Trump\u2019s Dream Of Becoming President - https:\/\/t.co\/8GIDVa76bZ Oooo, that's gonna hurt! #Unite\u2026","source":"\u003ca href=\"https:\/\/roundteam.co\" rel=\"nofollow\"\u003eRoundTeam\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2846552432,"id_str":"2846552432","name":"Glenn Silva","screen_name":"GlennSilva76","location":"hawaii","url":null,"description":"Christian, Constitutional Conservative, Pro 1A 2A and RF, It's Time To Unite And Take Our Country Back! 
#NeverTrump\r\n#UniteWithCruz #CruzCrew #CruzToVictory","protected":false,"verified":false,"followers_count":1981,"friends_count":2408,"listed_count":99,"favourites_count":1819,"statuses_count":38301,"created_at":"Wed Oct 08 07:34:50 +0000 2014","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/691834454868889601\/1gkIbY1C_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/691834454868889601\/1gkIbY1C_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/2846552432\/1453447926","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Apr 06 08:18:04 +0000 2016","id":717627418454966272,"id_str":"717627418454966272","text":"HONY Just Destroyed Donald Trump\u2019s Dream Of Becoming President - https:\/\/t.co\/8GIDVa76bZ Oooo, that's gonna hurt! 
#UniteWithCruz #NeverTrump","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":4726275950,"id_str":"4726275950","name":"Wisegal1958","screen_name":"WisegalGranny","location":null,"url":null,"description":null,"protected":false,"verified":false,"followers_count":475,"friends_count":290,"listed_count":73,"favourites_count":8976,"statuses_count":10881,"created_at":"Fri Jan 08 02:36:28 +0000 2016","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"2B7BB9","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/715082668770242561\/ohjXvK85_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/715082668770242561\/ohjXvK85_normal.jpg","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":1,"favorite_count":0,"entities":{"hashtags":[{"text":"UniteWithCruz","indices":[114,128]},{"text":"NeverTrump","indices":[129,140]}],"urls":[{"url":"https:\/\/t.co\/8GIDVa76bZ","expanded_url":"http:\/\/www.parhlo.com\/hony-just-destroyed-trumps-dream-of-becoming-president\/?track=twb","display_url":"parhlo.com\/hony-just-dest\u2026","indices":[65,88]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_statu
s":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"UniteWithCruz","indices":[133,140]},{"text":"NeverTrump","indices":[139,140]}],"urls":[{"url":"https:\/\/t.co\/8GIDVa76bZ","expanded_url":"http:\/\/www.parhlo.com\/hony-just-destroyed-trumps-dream-of-becoming-president\/?track=twb","display_url":"parhlo.com\/hony-just-dest\u2026","indices":[84,107]}],"user_mentions":[{"screen_name":"WisegalGranny","name":"Wisegal1958","id":4726275950,"id_str":"4726275950","indices":[3,17]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1459931635686"} Please help me. Thank you.
I had the same error once. Your script loads one JSON object for each line it reads, so the issue might be that your JSON objects are not separated by a newline. For instance, if your file contains json_object1 and json_object2 on separate lines, then the two objects will be read, whereas if the file contains json_object1 json_object2 on the same line, you will get an error. Solution: add a newline when writing a new JSON object to the output file. (related: https://stackoverflow.com/a/21058946/2314737)
Converting Google Ajax Search to a python dictionary
Im trying to write a script that gets google's ajax search results (For example: http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=filetype:pdf ) and download every file. Right now I'm stuck trying to convert the response to a python dictionary so its easier to move through. import subprocess import ast subprocess.call("curl -G -d 'q=filetype:pdf&v=1.0' http://ajax.googleapis.com/ajax/services/search/web > output",stderr=subprocess.STDOUT,shell=True) file = open('output','r') contents = file.read() output_dict = ast.literal_eval(contents) print output_dict When I run it, I get: $ python script.py % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 2643 0 2643 0 0 15926 0 --:--:-- --:--:-- --:--:-- 26696 Traceback (most recent call last): File "script.py", line 7, in <module> output_dict = ast.literal_eval(contents) File "/usr/lib/python2.7/ast.py", line 80, in literal_eval return _convert(node_or_string) File "/usr/lib/python2.7/ast.py", line 63, in _convert in zip(node.keys, node.values)) File "/usr/lib/python2.7/ast.py", line 62, in <genexpr> return dict((_convert(k), _convert(v)) for k, v File "/usr/lib/python2.7/ast.py", line 79, in _convert raise ValueError('malformed string') ValueError: malformed string The file looks like: {"responseData": {"results":[{"GsearchResultClass":"GwebSearch", "unescapedUrl":"http://www.foundationdb.com/AlphaLicenseAgreement.pdf", "url":"http://www.foundationdb.com/AlphaLicenseAgreement.pdf", "visibleUrl":"www.foundationdb.com", "cacheUrl":"http://www.google.com/search?q\u003dcache:W7zhFlfbm6UJ:www.foundationdb.com", "title":"FoundationDB Alpha Software Evaluation License Agreement", "titleNoFormatting":"FoundationDB Alpha Software Evaluation License Agreement", "content":"FOUNDATIONDB. ALPHA SOFTWARE EVALUATION LICENSE AGREEMENT. 
PLEASE READ CAREFULLY THE TERMS OF THIS ALPHA SOFTWARE \u003cb\u003e...\u003c/b\u003e", "fileFormat":"PDF/Adobe Acrobat" }, {"GsearchResultClass":"GwebSearch", "unescapedUrl":"https://subreg.cz/registration_agreement.pdf", "url":"https://subreg.cz/registration_agreement.pdf", "visibleUrl":"subreg.cz", "cacheUrl":"http://www.google.com/search?q\u003dcache:ODtRmQsiHD0J:subreg.cz", "title":"Registration Agreement", "titleNoFormatting":"Registration Agreement", "content":"Registration Agreement. In order to complete the registration process you must read and agree to be bound by all terms and conditions herein. TERMS AND \u003cb\u003e...\u003c/b\u003e", "fileFormat":"PDF/Adobe Acrobat" }, {"GsearchResultClass":"GwebSearch", "unescapedUrl":"http://supportdetails.com/export.pdf", "url":"http://supportdetails.com/export.pdf", "visibleUrl":"supportdetails.com", "cacheUrl":"http://www.google.com/search?q\u003dcache:h0LvxrTTKzIJ:supportdetails.com", "title":"Export PDF - Support Details", "titleNoFormatting":"Export PDF - Support Details", "content":"", "fileFormat":"PDF/Adobe Acrobat" }, {"GsearchResultClass":"GwebSearch", "unescapedUrl":"http://www.fws.gov/le/pdf/travelpetbird.pdf", "url":"http://www.fws.gov/le/pdf/travelpetbird.pdf", "visibleUrl":"www.fws.gov", "cacheUrl":"", "title":"pet bird", "titleNoFormatting":"pet bird", "content":"U.S. Fish \u0026amp; Wildlife Service. Traveling Abroad with. Your Pet Bird. 
The Wild Bird Conservation Act (Act), a significant step in international conservation efforts to \u003cb\u003e...\u003c/b\u003e", "fileFormat":"PDF/Adobe Acrobat" }], "cursor":{"resultCount":"72,800,000", "pages":[{"start":"0","label":1}, {"start":"4","label":2}, {"start":"8","label":3}, {"start":"12","label":4}, {"start":"16","label":5}, {"start":"20","label":6}, {"start":"24","label":7}, {"start":"28","label":8}], "estimatedResultCount":"72800000", "currentPageIndex":0, "moreResultsUrl":"http://www.google.com/search?oe\u003dutf8\u0026ie\u003dutf8\u0026source\u003duds\u0026start\u003d0\u0026hl\u003den\u0026q\u003dfiletype:pdf","searchResultTime":"0.04" } }, "responseDetails": null, "responseStatus": 200 } God that took forever to format
Google returns JSON, so use the json module instead of the ast module you are using now. file = open('output','r') output_dict = json.load(file) You may also want to study the urllib2 module to load the URL response instead of relying on curl.