How to output a live JSON feed in Python 3? - python

I am using Python 3 to access a live JSON feed from http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_day.geojson. This is the code:
try:
# For Py 3.0+
from urllib.request import urlopen
except ImportError:
# For Py 2
from urllib2 import urlopen
import json
def printResults(data):
# Use the json module to load the string data into a dictionary
theJSON = json.loads(data) #pass JSON data into a dictionary
# now we can access the contents of the JSON like any other Python object
if "title" in theJSON["metadata"]:
print (theJSON["metadata"]["title"])
def main():
# JSON feed of earthquake activity larger than 2.5 in the past 25 hours
urlData = "http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_day.geojson"
#open url and read contents
webUrl = urlopen(urlData)
print (webUrl.getcode())
if (webUrl.getcode() == 200):
data = webUrl.read()
#print results
printResults(data)
else:
print ("Received an error from server " + str(webUrl.getcode()))
if __name__ == "__main__":
main()
I get the following output:
Traceback (most recent call last):
File "<string>", line 420, in run_nodebug
File "C:\Users\modar\Desktop\jsondata_finished.py", line 56, in <module>
File "C:\Users\modar\Desktop\jsondata_finished.py", line 50, in main
else:
File "C:\Users\modar\jsondata_finished.py", line 13, in printResults
if "title" in theJSON["metadata"]:
File "C:\Python33\lib\json\__init__.py", line 319, in loads
return _default_decoder.decode(s)
File "C:\Python33\lib\json\decoder.py", line 352, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
TypeError: can't use a string pattern on a bytes-like object
How can I fix this? An explanation as to what went wrong would also be great. Thanks in advance.

With the requests library, linked to in my comment above, your code becomes:
quake_data = requests.get('http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_day.geojson').json()
print(quake_data['metadata']['title'])
I do hope it helps...

Related

Error while writing API result to another JSON file

I am working on AZURE Cognitive API Search. While getting the result from API, I want to write it into a new JSON File. I tried to access the analyse_result variable with the line but it does not work. It shows that the object is not JSON Serializable. My code is-
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials
from array import array
import os
from PIL import Image
import sys
import time
import json
import csv
subscription_key = ""
endpoint = ""
computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))
def azure_ocr_api(): #image_url
local_image_url = r"E:\Bank of Baroda\BOB IMAGE\Cheque309086.jpeg"
# read_response = computervision_client.read_in_stream(open("./Images/" + image_url,'rb'), raw=True)
read_response = computervision_client.read_in_stream(open(local_image_url,'rb'), raw=True)
# Get the operation location (URL with an ID at the end) from the response
read_operation_location = read_response.headers["Operation-Location"]
# Grab the ID from the URL
operation_id = read_operation_location.split("/")[-1]
# Call the "GET" API and wait for it to retrieve the results
while True:
read_result = computervision_client.get_read_result(operation_id)
if read_result.status not in ['notStarted', 'running']:
break
time.sleep(1)
list = []
if read_result.status == OperationStatusCodes.succeeded:
for text_result in read_result.analyze_result.read_results:
for line in text_result.lines:
with open('data.json', 'w', encoding='utf-8') as f:
json.dump(line, f, ensure_ascii=False, indent=4)
# print(list)
# pass
# return list
azure_ocr_api()
print("End of Computer Vision quickstart.")
The code shows a error like this -
Traceback (most recent call last):
File "e:\Bank of Baroda\m.py", line 44, in <module>
azure_ocr_api()
File "e:\Bank of Baroda\m.py", line 40, in azure_ocr_api
json.dump(line, f, ensure_ascii=False, indent=4)
File "C:\Users\Clasher\anaconda3\lib\json\__init__.py", line 179, in dump
for chunk in iterable:
File "C:\Users\Clasher\anaconda3\lib\json\encoder.py", line 438, in _iterencode
o = _default(o)
File "C:\Users\Clasher\anaconda3\lib\json\encoder.py", line 179, in default
TypeError: Object of type Line is not JSON serializable
Please help.

Printing the content of URLs in a text file, from a text file which contains a list of URLs in python

I have a text file which contains a list of URLs and I am willing to print the contents of the URL in another text file, along with the URL as the header. I have used this project file https://pypi.org/project/Wikipedia-API/ to extract the content, but I would have to enter the link one after another, which I do not want to delve into, since my list is huge, with at least 3000 links per text file.
Can anyone help me with this, it would be highly appreciated.
EDIT:
I have tried this in the following way, but there is no content in the output txt file.
import urllib
import datetime as dt
from datetime import datetime
import time
linklist = []
with open ("test.txt", 'r', encoding = 'utf=8') as wikitxt :
#content = wikitxt.read().splitlines()
for i in wikitxt:
linklist.append (i)
output = open('Wikipedia_content.txt', 'w', encoding='utf-8')
startTime = time.time()
endTime = time.time()
runTime = endTime - startTime
print("Runtime is %3f seconds" % runTime)
Here is the txt file that I have used https://pastebin.com/Y4bwsHGB , and this is the text file that I need to use : https://pastebin.com/SXDAu8jV.
Thanks in advance.
PROBLEM:
Traceback (most recent call last):
File "C:/Users/suva_/Desktop/Project specification/data/test2.py", line 13, in <module>
output_file.write((urlopen(link).read()))
File "D:\Python 36\lib\urllib\request.py", line 228, in urlopen
return opener.open(url, data, timeout)
File "D:\Python 36\lib\urllib\request.py", line 531, in open
response = self._open(req, data)
File "D:\Python 36\lib\urllib\request.py", line 554, in _open
'unknown_open', req)
File "D:\Python 36\lib\urllib\request.py", line 509, in _call_chain
result = func(*args)
File "D:\Python 36\lib\urllib\request.py", line 1389, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib.error.URLError: <urlopen error unknown url type: https>
FINAL FIX:
import urllib
import datetime as dt
from datetime import datetime
import requests
import time
import re
import html2text
startTime = time.time()
def text_opener():
linklist=[]
with open ("test.txt", 'r', encoding = 'utf=8') as wikitxt :
#content = wikitxt.read().splitlines()
for i in wikitxt:
try:
linklist.append(i.strip())
except UnicodeEncodeError as enror:
linklist.append ("")
return linklist
linklist = text_opener() # put the content in a list and then opened the text
'''
This is a string of characters which I wanted to remove from the URL content
rejectedChar = list('!"#$%&\'()*+,-./:;<=>?#[\\]^_`{|}~0123456789')
rejectedChar.append("\t")
special="\t"
regexWords = r"[\w']+"
'''
'''STOPWORDS LIST WHICH CONTAINS A BUNCH OF WORDS WHICH I DON"T NEED TO BE PRINTED--- ONLY FOR LARGE FILES
#stopwords = []
#with open('stopwords.txt', 'r', encoding='utf-8') as inFile:
# for i in inFile:
# stopwords.append(i.strip())
'''
content = ""
count = 0
for i in linklist:
print(count," ",i.encode('utf-8'))
count+=1
try:
f = urllib.request.urlopen(i).read()
content+=str(f)
except Exception as e:
continue
#print((linklist[0:4000]).encode('utf-8'))
#combinedstops= rejectedChar+stopwords # combining them together
#for item in combinedstops:
#content=content.replace(item,"") # now this items are removed from the
#content
def output_file (content):
with open('June_wikipedia_content.txt', 'w', encoding = 'utf-8') as output:
output.write(str(content))
## try:
## output_file (content)
## except UnicodeEncodeError as enror:
## print ("Got lost in the game")
#sky=open("sky.txt",'w')
#sky.write(str(content))
output_file (content)
#print("hahahahahaha",stopwords)
#for i in content:
# i = re.findall(regexWords, i)
# i = [i for i in i if i in stopwords]
startTime = time.time()
endTime = time.time()
runTime = endTime - startTime
print("Runtime is %3f seconds" % runTime)
You can use the following function to open the text file and store all the links in a list:
with open('links.txt') as f:
content = f.read().splitlines()
The variable content is a list with each element containing the string associated with a URL. This will only work though if your links.txt has the URL's arranged line by line i.e:
www.google.co.in
www.wikipedia.co.in
www.youtube.co.in
Once you get this list you can iterate through it with a simple for loop and do what you desire.
If you want a more detailed answer I suggest posting an example text file of the links.
EDIT :
This works but it dumps the whole data into the file. The data is not formatted correctly. Is this what you need ?
from urllib.request import urlopen
with open('links.txt') as f:
content = f.read().splitlines()
with open('Wikipedia_content.txt', 'w') as output_file:
for link in content :
output_file.write(link)
output_file.write((urlopen(link).read()))

unable to parse json content with error `expecting property name enclosed`

I am fetching from database a text that is in a json format. Basically dumping an api content from a website to a db with TEXT field.
I can see the json content has no issues/errors, but when I do ..
try:
get_all = db.query("SELECT id, name, api_content FROM _books")
_result = get_all.fetchall()
except Exception as e:
print("[e::line-163] ", e)
try:
for r in _result:
api_content = r[2]
j = json.loads(api_content)
print('names, ', j['names'])
I get this error .
Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
The api_content has no issues, it is a well-formed json content.
I don't think you have a valid JSON in api_content. Judging by what the error is about, your properties (keys) are enclosed in single quotes but need to be enclosed in double quotes to be a valid JSON.
Here is what is happening:
>>> import json
>>> json.loads('{"key": "value"}') # <- VALID JSON
{'key': 'value'}
>>> json.loads("{'key': 'value'}") # <- INVALID JSON
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/json/__init__.py", line 319, in loads
return _default_decoder.decode(s)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/json/decoder.py", line 339, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/json/decoder.py", line 355, in raw_decode
obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
In other words, you should revise the way you dump the API contents into the database. It looks like you are just writing string representations of Python dictionary objects instead of using json.dumps().
If you cannot change that, try using ast.literal_eval() to safely eval the api content:
from ast import literal_eval
for r in _result:
api_content = r[2]
j = literal_eval(api_content)
print('names, ', j['names'])

Parsing XML using json raises ValueError

I'm trying to parse a XML file using xml ElementTree and json
from xml.etree import ElementTree as et
import json
def parse_file(file_name):
tree = et.ElementTree()
npcs = {}
for npc in tree.parse(file_name):
quests = []
for quest in npc:
quest_name = quest.attrib['name']
stages = []
for i, stage in enumerate(quest):
next_stage, choice, npc_condition = None, None, None
for key, val in stage.attrib.items():
val = json.loads(val)
if key == 'choices':
choice = val
elif key == 'next_stage':
next_stage = val
elif key == 'ncp_condition':
npc_condition = {stage.attrib['npc_name']: val}
stages.append([i, next_stage, choice, npc_condition])
quests.append( {quest_name:stages})
npcs[npc.attrib['name']] = quests
return npcs
The XML file:
<?xml version="1.0" encoding="utf-8"?>
<npcs>
<npc name="NPC NAME">
<quest0 name="Quest Name here">
<stage0 choices='{"Option1":1, "Option1":2}'>
<text>text1</text>
</stage0>
<stage1 next_stage="[3,4]">
<text>text2</text>
</stage1>
<stage3 npc_name="other_npc_name" ncp_condition='{"some_condition":false}' next_stage="[3, 4]">
<text>text3</text>
</stage3>
</quest0>
</npc>
</npcs>
But I'm having trouble with this bit:
<stage3 npc_name="other_npc_name" ncp_condition='{"some_condition":false}' next_stage="[3, 4]">
Traceback:
Traceback (most recent call last):
File "C:/.../test2.py", line 28, in <module>
parse_file('quests.xml')
File "C:/.../test2.py", line 15, in parse_file
val = json.loads(val)
File "C:\Python27\lib\json\__init__.py", line 338, in loads
return _default_decoder.decode(s)
File "C:\Python27\lib\json\decoder.py", line 366, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Python27\lib\json\decoder.py", line 384, in raw_decode
raise ValueError("No JSON object could be decoded")
ValueError: No JSON object could be decoded
It raises this error in the line val = json.loads(val) when key="npc_name" and val="other_npc_name".
What's wrong with that? It didn't raise any error when name="some string", but it does when npc_name="some string".
I noticed that if I change "other_npc_name" to '"other_npc_name"' it doesn't complain, but this seem a bit hackish to me
JSON is a way to store data structures - thus it can only decode said data structures.
When you try to get JSON to decode something like this:
other_npc_name
JSON can't match this to any valid data type. However, if this is wrapped in quotation marks:
"other_npc_name"
JSON recognizes this as a String (as per the JSON spec, that is how a string is defined).
And this is what is happening in your script:
import json
print json.loads("other_npc_name") #throws error
print json.loads('"other_npc_name"') #returns "other_npc_name" as a Unicode string
Thus, it may seem 'hackish' to wrap the string this way, however, this is really the only way for JSON to decode it.
One potential suggestion is that if the npc_name attribute in XML is always a string, then pull it out as a string instead of trying to decode it as a JSON object.

reading large JSON file in Python (raw_decode)

I am trying to read in large JSON file (data.json) in Python. Because the JSON file has multiple JSON objects, and multiple dictionaries will be created in Python(the number of dictionaries are unknown), I used decoder.raw_decode() and generator.
The following is the code:
import json
import pprint
import io
import pprint
def parse():
with open('data.json',encoding='utf-8') as jfile:
try:
while True:
decoder = json.JSONDecoder()
obj, idx = decoder.raw_decode(jfile)
yield obj
except ValueError as e:
print(e)
pass
else:
print("aha")
def main():
imputd=parse()
if imputd:
while True:
try:
print(str(next(imputd)).readlines())
except StopIteration as e:
print(e)
break
main()
I get the error:
Traceback (most recent call last):
File "H:\Document\Python\j10.py", line 57, in <module>
main()
File "H:\Document\Python\j10.py", line 36, in main
print(str(next(imputd)).readlines())
File "H:\Document\Python\j10.py", line 21, in parse
obj, idx = decoder.raw_decode(jfile)
File "C:\Python34\lib\json\decoder.py", line 360, in raw_decode
obj, end = self.scan_once(s, idx)
TypeError: first argument must be a string, not _io.TextIOWrapper
I edited code based on Martijn's answer:
import json
import io
file=open('data.json.txt')
def readin():
return file.read(2000)
def parse():
decoder = json.JSONDecoder()
buffer = ''
for chunk in iter(readin, ''):
buffer += chunk
while buffer:
try:
result, index = decoder.raw_decode(buffer)
yield result
buffer = buffer[index:]
except ValueError:
# Not enough data to decode, read more
break
def main():
imputd=parse()
if imputd:
while True:
try:
print(str(next(imputd)).readlines())
except StopIteration as e:
print(e)
break
main()
and I get an UnicodeError:
Traceback (most recent call last):
File "H:\Document\Python\j11.py", line 35, in <module>
main()
File "H:\Document\Python\j11.py", line 30, in main
print(str(next(imputd)).readlines())
File "H:\Document\Python\j11.py", line 14, in parse
for chunk in iter(readin, ''):
File "H:\Document\Python\j11.py", line 8, in readin
return file.read(2000)
File "C:\Python34\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 4217: character maps to <undefined>
You are passing in the file object, but decoder.raw_decode() only takes text data. You need to do the reading yourself:
obj, idx = decoder.raw_decode(jfile.read())
You are then yielding Python objects created from the JSON data, so your .readlines() call in your main() function loop will also fail.
You are not using raw_decode() correctly, however. You are yourself responsible for feeding it chunks of text, it'll not read that text from the file for you. If you wanted to handle the file in chunks, and there are no clear delimiters between the JSON entries, you'll be forced to read the file in blocks:
decoder = json.JSONDecoder()
buffer = ''
for chunk in iter(partial(jfile.read, buffersize), ''):
buffer += chunk
while buffer:
try:
result, index = decoder.raw_decode(buffer)
yield result
buffer = buffer[index:]
except ValueError:
# Not enough data to decode, read more
break
This will still yield completely decoded objects; if your file is one long JSON object (like one top-level list or dictionary) then this'll not yield the contents of that object one by one; it'll still read the whole object before yielding.

Categories

Resources