Problem of running python program on line command - python

I'm running twitter_hashtag_frequency.py programm on line command with a json file test.jsonl as parameter and I still have a below error however I validated this json file there is no a format problem.
C:\Users\HP\PycharmProjects\Bonzanini_Book_Exercises>python twitter_hashtag_frequency.py test.jsonl
Traceback (most recent call last):
File "twitter_hashtag_frequency.py", line 18, in <module>
tweet = json.loads(line)
File "C:\Users\HP\Python\Python38\lib\json\__init__.py", line 357, in loads
return _default_decoder.decode(s)
File "C:\Users\HP\Python\Python38\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Users\HP\Python\Python38\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 1 (char 1)
This is the content of test.jsonl:
{"created_at":"Tue Jul 21 00:47:40 +0000 2020","id":1285375860199972866,"id_str":"1285375860199972866","text":"RT #CBCAlerts: Big spike in new cases of COVID-19 in B.C., with 102 confirmed over weekend. 'We do have the possibility of having explosive\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2564198800,"id_str":"2564198800","name":"Fayesella","screen_name":"frbaerwald","location":null,"url":null,"description":null,"translator_type":"none","protected":false,"verified":false,"followers_count":6,"friends_count":107,"listed_count":0,"favourites_count":1228,"statuses_count":88,"created_at":"Sun May 25 21:51:11 +0000 2014","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/1249529611760717826\/pmKLZKkR_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/1249529611760717826\/pmKLZKkR_normal.jpg","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Mon Jul 20 22:26:19 +0000 2020","id":1285340287104147457,"id_str":"1285340287104147457","text":"Big spike in new cases of COVID-19 in B.C., with 102 confirmed over weekend. 'We do have the possibility of having\u2026 https:\/\/t.co\/X9JH7qNj6o","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eTwitter Web App\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":19038934,"id_str":"19038934","name":"CBC News Alerts","screen_name":"CBCAlerts","location":"Toronto","url":"http:\/\/www.cbc.ca\/news\/","description":"Breaking national and international news alerts from CBC News, Canada's TV, radio, online and social media news leader.","translator_type":"none","protected":false,"verified":true,"followers_count":1304745,"friends_count":398,"listed_count":8806,"favourites_count":0,"statuses_count":142466,"created_at":"Thu Jan 15 21:03:19 +0000 2009","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme7\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme7\/bg.gif","profile_background_tile":false,"profile_link_color":"FF0000","profile_sidebar_border_color":"F2E195","profile_sidebar_fill_color":"FFF7CC","profile_text_color":"0C3E53","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/563807705530245120\/92toBEKN_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/563807705530245120\/92toBEKN_normal.jpeg","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"Big spike in new cases of COVID-19 in B.C., with 102 confirmed over weekend. 'We do have the possibility of having explosive growth here in our outbreak, if we're not careful,' Provincial Health Officer Dr. Bonnie Henry said. https:\/\/t.co\/dg1t2Q7MZU","display_text_range":[0,249],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/dg1t2Q7MZU","expanded_url":"http:\/\/cbc.ca\/1.5655625","display_url":"cbc.ca\/1.5655625","indices":[226,249]}],"user_mentions":[],"symbols":[]}},"quote_count":48,"reply_count":26,"retweet_count":144,"favorite_count":208,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/X9JH7qNj6o","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/1285340287104147457","display_url":"twitter.com\/i\/web\/status\/1\u2026","indices":[116,139]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"CBCAlerts","name":"CBC News Alerts","id":19038934,"id_str":"19038934","indices":[3,13]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1595292460857"}
This is twitter_hashtag_frequency.py code :
import sys
from collections import Counter
import json
def get_hashtags(tweet):
entities = tweet.get('entities', {})
hashtags = entities.get('hashtags', [])
return [tag['text'].lower() for tag in hashtags]
if __name__ == '__main__':
fname = sys.argv[1]
with open(fname,'r') as f:
hashtags = Counter()
for line in f:
tweet = json.loads(line)
hashtags_in_tweet = get_hashtags(tweet)
hashtags.update(hashtags_in_tweet)
for tag, count in hashtags.most_common(20):
print("{}: {}".format(tag, count))
Someone can help me to solve this problem? Maybe something is wrong in the code I don't know. I'll appreciate your help, it's been days that I have this problem.

Related

I can't do json()['graphql']['user'] for instagram API

I am trying to make a tool that gets all the information in JSON out of a Instagram profile page.
Such as example : https://www.instagram.com/dave_saa/?__a=1
Whenever I try to do that, I get a error. The error is : simplejson.errors.JSONDecodeError: Expecting value: line 1 column 1 (char 0).
ERROR IN MORE DETAIL
Traceback (most recent call last):
File "C:\Users\disco\PycharmProjects\IgOSINT\main.py", line 9, in <module>
json_found_for_site = request_for_site.json()
File "C:\Users\disco\AppData\Local\Programs\Python\Python39\lib\site-packages\requests\models.py", line 910, in json
return complexjson.loads(self.text, **kwargs)
File "C:\Users\disco\AppData\Local\Programs\Python\Python39\lib\site-packages\simplejson\__init__.py", line 525, in loads
return _default_decoder.decode(s)
File "C:\Users\disco\AppData\Local\Programs\Python\Python39\lib\site-packages\simplejson\decoder.py", line 370, in decode
obj, end = self.raw_decode(s)
File "C:\Users\disco\AppData\Local\Programs\Python\Python39\lib\site-packages\simplejson\decoder.py", line 400, in raw_decode
return self.scan_once(s, idx=_w(s, idx).end())
simplejson.errors.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
MY PYTHON CODE
import requests
from termcolor import colored
import json
import simplejson
target = str(input(colored('[+] Enter Target Username: ', 'blue')))
request_for_site = requests.get('https://www.instagram.com/' + target + '/?__a=1')
json_found_for_site = request_for_site.json()['graphql']['user']
if (request_for_site.status_code == 200):
print(colored('[+++] TARGET FOUND !', 'green'))
print(colored(
'''
[1] USERNAME
[2] FULL NAME
[3] BIO
[4] HIGHLIGHTS
[5] PHONE NUMBER
[6] IS ACCOUNT PRIVATE OR PUBLIC [recommended FIRST]
[7] Profile Picture
[8] Followers
[9] Followed
[10] ID
[11] IS VERIFIED
''', 'red'
))
tool_option = str(input(colored('[+] ENTER NUMBER OPTION TO FIND: ', 'blue')))
if (tool_option == '1'):
print(json_found_for_site['username'])
elif (tool_option == '2'):
print(json_found_for_site['full_name'])
The program is still in development and not finished. But I get error, so that is why I stopped temporally and I asked help here.
Someone help, please.
The document you are parsing is HTML, not JSON. You cannot parse HTML with a JSON parser, to parse HTML, you need an HTML parser.

Fetch data from SQL Server and convert to JSON / Error Expecting value: line 1 column 1 (char 0)

I would like fetch data from a SQL Server database and transform the result in a JSON format.
Thats not difficult, but one column is already a JSON and I would like separate it, but the result is a little bit confusing.
My code:
rows = cursor.fetchall()
print (rows)
objects_list = []
for row in rows:
d= collections.OrderedDict()
#d["Bestelldatum"]= row[0].strftime("%Y-%m-%d %H:%M")
a = json.loads(row[0])
#print(a)
#d["Adresse"] = row[0]
#d["Tor"] = a["tor"]
#d["Stiege"] = a["stg"]
#d["Stock"] = a["stk"]
#d["Tür"] = a["tür"]
#d["PLZ"] = a["plz"]
objects_list.append(d)
j = json.dumps(objects_list, ensure_ascii=False)
print (j)
My problem is row 1, that is a JSON, see the pic
Picture from database
If I run it so, I get this error messange
in loads
return _default_decoder.decode(s)
File "C:\Users\acas1\AppData\Local\Programs\Python\Python39\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Users\acas1\AppData\Local\Programs\Python\Python39\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
I guess something is wrong with the JSON data which I get from the database
I don't understand why some data have backslashes, because at SQL they not exist.
Thats the result if I fetch just the raw data from the database:
Picture from result in python
I tried everything that they say in this post JSONDecodeError: Expecting value: line 1 column 1 (char 0)
but nothing helps.
I hope someone have an idea

json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) - Error only occurs when code is nested

I am currently working on a project that retrieves data about car auctions, I have it set up to request a custom Ebay URL that uses their api, I request the page and convert it to a JSON for handling. The code runs with no errors at all if the code is by itself but if I put it within a function or within a conditional statement or anything else that means it is nested it will give me the JSON error
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
My code is here, however I dont know if it is an issue with my code as it works fine when it is not within a function
ebayurl = "http://svcs.ebay.com/services/search/FindingService/v1?\
SECURITY-APPNAME=KyleOsbo-CarSearc-PRD-adf6708f9-c75353fe\
&OPERATION-NAME=findItemsAdvanced\
&SERVICE-VERSION=1.13.0\
&GLOBAL-ID=EBAY-GB\
&RESPONSE-DATA-FORMAT=JSON\
&REST-PAYLOAD\
&categoryId(0)=9801\
&outputSelector(0)=SellerInfo\
&keywords="+"honda%20civic" #The custom url was created based on my needs, I only want to search ebay uk and only within the cars category
apiResult = requests.get(ebayurl) #Request the custom url
parsedresult = apiResult.json() #Convert url to json format in order to extract information easier
for item in (parsedresult["findItemsAdvancedResponse"][0]["searchResult"][0]["item"]): #JSON is set up as multi dimensional array, looks within it to extract values
title = item["title"][0]
price = item["sellingStatus"][0]["convertedCurrentPrice"][0]["__value__"]
itemURL = item["viewItemURL"][0]
location = item["location"][0]
itemid = item["itemId"][0]
with sqlite3.connect("results.db") as db: #Connecting to table ready to insert new records
cursor = db.cursor()
values = (itemid, title, price, location, itemURL) #Declaring values that will be inserted,preventing sql injection, these values will change upon every iteration
sql = """ INSERT INTO ebay_results(item_id, title, price, location, itemURL)
VALUES(?,?,?,?,?)
"""
cursor.execute(sql, values) #Inserts a new record for every item found
db.commit()
The error occurs at
parsedresult = apiResult.json()
Traceback (most recent call last):
File "C:\Users\ikoze\Documents\Computer Science\Coursework
files\carrySearch.py", line 94, in <module>
parsedresult = apiResult.json() #Convert url to json format in order to
extract information easier
File "C:\Users\ikoze\AppData\Local\Programs\Python\Python36-32\lib\site-
packages\requests\models.py", line 892, in json
return complexjson.loads(self.text, **kwargs)
File "C:\Users\ikoze\AppData\Local\Programs\Python\Python36-
32\lib\json__init__.py", line 354, in loads
return _default_decoder.decode(s)
File "C:\Users\ikoze\AppData\Local\Programs\Python\Python36-
32\lib\json\decoder.py", line 339, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Users\ikoze\AppData\Local\Programs\Python\Python36-
32\lib\json\decoder.py", line 357, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Tokenizing and Removing Stopwords from JSON using nltk

Hi I keep getting this error:
D:\WinPython-32bit-2.7.10.3\python-2.7.10>python TweetTest.py Twitter.json
Traceback (most recent call last):
File "TweetTest.py", line 60, in <module>
tweet = json.loads(line)
File "D:\WinPython-32bit-2.7.10.3\python-2.7.10\lib\json\__init__.py", line 338, in loads
return _default_decoder.decode(s)
File "D:\WinPython-32bit-2.7.10.3\python-2.7.10\lib\json\decoder.py", line 369, in decode
raise ValueError(errmsg("Extra data", s, end, len(s)))
ValueError: Extra data: line 1 column 4488 - line 1 column 99678411 (char 4487 - 99678410)
I have no idea what is wrong. My code is as follows:
import sys
import json
from collections import Counter
import re
from nltk.corpus import stopwords
import string
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']
emoticons_str = r"""
(?:
[:=;] # Eyes
[oO\-]? # Nose (optional)
[D\)\]\(\]/\\OpP] # Mouth
)"""
regex_str = [
emoticons_str,
r'<[^>]+>', # HTML tags
r'(?:#[\w_]+)', # #-mentions
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
r'http[s]?://(?:[a-z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
r'(?:[\w_]+)', # other words
r'(?:\S)' # anything else
]
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
return tokens_re.findall(s)
def preprocess(s, lowercase=False):
tokens = tokenize(s)
if lowercase:
tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
return tokens
if __name__ == '__main__':
fname = sys.argv[1]
with open(fname, 'r') as f:
count_all = Counter()
for line in f:
tweet = json.loads(line)
tokens = preprocess(tweet['text'])
count_all.update(tokens)
print(count_all.most_common(5))
This is the first two output of my JSON file. I have used a Tweet Stream listener to collect the tweets.
{"created_at":"Wed Apr 06 08:33:55 +0000 2016","id":717631408345333760,"id_str":"717631408345333760","text":"RT #whosharold: Hilary Clinton cannot be president pls she can't even hold her man down what makes ya think she gon hold the office down","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":472387071,"id_str":"472387071","name":"BigGucciK 2x","screen_name":"KaisonThatBoy","location":"Bridgeport, CT","url":null,"description":null,"protected":false,"verified":false,"followers_count":1608,"friends_count":1219,"listed_count":8,"favourites_count":1293,"statuses_count":64337,"created_at":"Mon Jan 23 22:07:27 +0000 2012","utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/709500377104818182\/4vMu066C_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/709500377104818182\/4vMu066C_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/472387071\/1457000395","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Apr 06 03:16:15 +0000 2016","id":717551464575401984,"id_str":"717551464575401984","text":"Hilary Clinton cannot be president pls she can't even hold her man down what makes ya think she gon hold the office down","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":792436550,"id_str":"792436550","name":"sadboyz","screen_name":"whosharold","location":null,"url":null,"description":"platano maduro no vuelve a verde","protected":false,"verified":false,"followers_count":1285,"friends_count":979,"listed_count":11,"favourites_count":4877,"statuses_count":91425,"created_at":"Thu Aug 30 21:26:30 +0000 2012","utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/773304539\/94dbc3d1558da7f1e3d2c6fffcb5d710.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/773304539\/94dbc3d1558da7f1e3d2c6fffcb5d710.jpeg","profile_background_tile":true,"profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/714669878012219392\/9HmilvPG_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/714669878012219392\/9HmilvPG_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/792436550\/1458855437","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":2,"favorite_count":7,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"whosharold","name":"sadboyz","id":792436550,"id_str":"792436550","indices":[3,14]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1459931635353"}
{"created_at":"Wed Apr 06 08:33:55 +0000 2016","id":717631409742020609,"id_str":"717631409742020609","text":"RT #WisegalGranny: HONY Just Destroyed Donald Trump\u2019s Dream Of Becoming President - https:\/\/t.co\/8GIDVa76bZ Oooo, that's gonna hurt! #Unite\u2026","source":"\u003ca href=\"https:\/\/roundteam.co\" rel=\"nofollow\"\u003eRoundTeam\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2846552432,"id_str":"2846552432","name":"Glenn Silva","screen_name":"GlennSilva76","location":"hawaii","url":null,"description":"Christian, Constitutional Conservative, Pro 1A 2A and RF, It's Time To Unite And Take Our Country Back! #NeverTrump\r\n#UniteWithCruz #CruzCrew #CruzToVictory","protected":false,"verified":false,"followers_count":1981,"friends_count":2408,"listed_count":99,"favourites_count":1819,"statuses_count":38301,"created_at":"Wed Oct 08 07:34:50 +0000 2014","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/691834454868889601\/1gkIbY1C_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/691834454868889601\/1gkIbY1C_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/2846552432\/1453447926","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Apr 06 08:18:04 +0000 2016","id":717627418454966272,"id_str":"717627418454966272","text":"HONY Just Destroyed Donald Trump\u2019s Dream Of Becoming President - https:\/\/t.co\/8GIDVa76bZ Oooo, that's gonna hurt! #UniteWithCruz #NeverTrump","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":4726275950,"id_str":"4726275950","name":"Wisegal1958","screen_name":"WisegalGranny","location":null,"url":null,"description":null,"protected":false,"verified":false,"followers_count":475,"friends_count":290,"listed_count":73,"favourites_count":8976,"statuses_count":10881,"created_at":"Fri Jan 08 02:36:28 +0000 2016","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"2B7BB9","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/715082668770242561\/ohjXvK85_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/715082668770242561\/ohjXvK85_normal.jpg","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":1,"favorite_count":0,"entities":{"hashtags":[{"text":"UniteWithCruz","indices":[114,128]},{"text":"NeverTrump","indices":[129,140]}],"urls":[{"url":"https:\/\/t.co\/8GIDVa76bZ","expanded_url":"http:\/\/www.parhlo.com\/hony-just-destroyed-trumps-dream-of-becoming-president\/?track=twb","display_url":"parhlo.com\/hony-just-dest\u2026","indices":[65,88]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"UniteWithCruz","indices":[133,140]},{"text":"NeverTrump","indices":[139,140]}],"urls":[{"url":"https:\/\/t.co\/8GIDVa76bZ","expanded_url":"http:\/\/www.parhlo.com\/hony-just-destroyed-trumps-dream-of-becoming-president\/?track=twb","display_url":"parhlo.com\/hony-just-dest\u2026","indices":[84,107]}],"user_mentions":[{"screen_name":"WisegalGranny","name":"Wisegal1958","id":4726275950,"id_str":"4726275950","indices":[3,17]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1459931635686"}
Please help me. Thank you.
I had the same error once.
Your script loads a JSON object at each line read, the issue might be that your JSON objects are not separated by a newline.
For instance if your file contains
json_oject1
json_oject2
then the two objects will be read whereas if the file contains
json_oject1 json_oject2
you will get an error.
Solution: add a newline when writing a new JSON object to the output file.
(related: https://stackoverflow.com/a/21058946/2314737)

Converting Google Ajax Search to a python dictionary

Im trying to write a script that gets google's ajax search results (For example: http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=filetype:pdf ) and download every file. Right now I'm stuck trying to convert the response to a python dictionary so its easier to move through.
import subprocess
import ast
subprocess.call("curl -G -d 'q=filetype:pdf&v=1.0' http://ajax.googleapis.com/ajax/services/search/web > output",stderr=subprocess.STDOUT,shell=True)
file = open('output','r')
contents = file.read()
output_dict = ast.literal_eval(contents)
print output_dict
When I run it, I get:
$ python script.py
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 2643 0 2643 0 0 15926 0 --:--:-- --:--:-- --:--:-- 26696
Traceback (most recent call last):
File "script.py", line 7, in <module>
output_dict = ast.literal_eval(contents)
File "/usr/lib/python2.7/ast.py", line 80, in literal_eval
return _convert(node_or_string)
File "/usr/lib/python2.7/ast.py", line 63, in _convert
in zip(node.keys, node.values))
File "/usr/lib/python2.7/ast.py", line 62, in <genexpr>
return dict((_convert(k), _convert(v)) for k, v
File "/usr/lib/python2.7/ast.py", line 79, in _convert
raise ValueError('malformed string')
ValueError: malformed string
The file looks like:
{"responseData": {"results":[{"GsearchResultClass":"GwebSearch",
"unescapedUrl":"http://www.foundationdb.com/AlphaLicenseAgreement.pdf",
"url":"http://www.foundationdb.com/AlphaLicenseAgreement.pdf",
"visibleUrl":"www.foundationdb.com",
"cacheUrl":"http://www.google.com/search?q\u003dcache:W7zhFlfbm6UJ:www.foundationdb.com",
"title":"FoundationDB Alpha Software Evaluation License Agreement",
"titleNoFormatting":"FoundationDB Alpha Software Evaluation License Agreement",
"content":"FOUNDATIONDB. ALPHA SOFTWARE EVALUATION LICENSE AGREEMENT. PLEASE READ CAREFULLY THE TERMS OF THIS ALPHA SOFTWARE \u003cb\u003e...\u003c/b\u003e",
"fileFormat":"PDF/Adobe Acrobat"
},
{"GsearchResultClass":"GwebSearch",
"unescapedUrl":"https://subreg.cz/registration_agreement.pdf",
"url":"https://subreg.cz/registration_agreement.pdf",
"visibleUrl":"subreg.cz",
"cacheUrl":"http://www.google.com/search?q\u003dcache:ODtRmQsiHD0J:subreg.cz",
"title":"Registration Agreement",
"titleNoFormatting":"Registration Agreement",
"content":"Registration Agreement. In order to complete the registration process you must read and agree to be bound by all terms and conditions herein. TERMS AND \u003cb\u003e...\u003c/b\u003e",
"fileFormat":"PDF/Adobe Acrobat"
},
{"GsearchResultClass":"GwebSearch",
"unescapedUrl":"http://supportdetails.com/export.pdf",
"url":"http://supportdetails.com/export.pdf",
"visibleUrl":"supportdetails.com",
"cacheUrl":"http://www.google.com/search?q\u003dcache:h0LvxrTTKzIJ:supportdetails.com",
"title":"Export PDF - Support Details",
"titleNoFormatting":"Export PDF - Support Details",
"content":"",
"fileFormat":"PDF/Adobe Acrobat"
},
{"GsearchResultClass":"GwebSearch",
"unescapedUrl":"http://www.fws.gov/le/pdf/travelpetbird.pdf",
"url":"http://www.fws.gov/le/pdf/travelpetbird.pdf",
"visibleUrl":"www.fws.gov",
"cacheUrl":"",
"title":"pet bird",
"titleNoFormatting":"pet bird",
"content":"U.S. Fish \u0026amp; Wildlife Service. Traveling Abroad with. Your Pet Bird. The Wild Bird Conservation Act (Act), a significant step in international conservation efforts to \u003cb\u003e...\u003c/b\u003e",
"fileFormat":"PDF/Adobe Acrobat"
}],
"cursor":{"resultCount":"72,800,000",
"pages":[{"start":"0","label":1},
{"start":"4","label":2},
{"start":"8","label":3},
{"start":"12","label":4},
{"start":"16","label":5},
{"start":"20","label":6},
{"start":"24","label":7},
{"start":"28","label":8}],
"estimatedResultCount":"72800000",
"currentPageIndex":0,
"moreResultsUrl":"http://www.google.com/search?oe\u003dutf8\u0026ie\u003dutf8\u0026source\u003duds\u0026start\u003d0\u0026hl\u003den\u0026q\u003dfiletype:pdf","searchResultTime":"0.04"
}
},
"responseDetails": null,
"responseStatus": 200
}
God that took forever to format
Google returns JSON, so use the json module instead of the ast module you are using now.
file = open('output','r')
output_dict = json.load(file)
You may also want to study the urllib2 module to load the URL response instead of relying on curl.

Categories

Resources