I have the following code which utilises boto3 for AWS.
import boto3
from trp import Document
# Location of the scanned document in S3.
s3BucketName = "bucket"
documentName = "doc.png"

# Textract client; the form-analysis request points at the S3 object above.
textract = boto3.client('textract')
s3_location = {'S3Object': {'Bucket': s3BucketName, 'Name': documentName}}
response = textract.analyze_document(Document=s3_location, FeatureTypes=["FORMS"])

# Wrap the raw response with the trp helper and list the form fields page by page.
doc = Document(response)
for page in doc.pages:
    print("Fields:")
    for field in page.form.fields:
        print("Key: {}, Value: {}".format(field.key, field.value))
I am trying to save the output of that loop as a dict, JSON, or CSV, but I am not an experienced Python programmer yet.
I tried this:
# Asker's attempt at collecting the form fields into the dict key_map.
key_map = {}
filepath = 'output.txt'
# NOTE(review): open() without a mode opens the file for reading, so nothing is written to output.txt here.
with open(filepath) as fp:
line = fp.readline()
cnt = 1
while line:
for page in doc.pages:
# Print fields
print("Fields:")
for field in page.form.fields:
#print("Key: {}, Value: {}".format(field.key, field.value))
# NOTE(review): str() called with two positional arguments treats the second one as an encoding; it does not combine key and value.
key_map[str(field.key, field.value)] = cnt
line = fp.readline()
cnt +=1
But I don't think that this solution is working. Any tips on how to save the output of that for loop as a JSON?
If you want CSV output, you can use the csv module like this:
import csv
doc = Document(response)
# newline='' lets the csv module control the line endings itself; without it,
# platforms that translate '\n' to '\r\n' end up with blank rows in the file.
with open('aws_doc.csv', mode='w', newline='') as aws_field_file:
    field_write = csv.writer(aws_field_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for page in doc.pages:
        for field in page.form.fields:
            # This will write it as your <key>, <value>
            field_write.writerow([field.key, field.value])
In case you want headers in the file you can also use the DictWriter which would make it easy for you to just pass a dictionary:
https://docs.python.org/3.4/library/csv.html#csv.DictWriter
Related
#This code currently handles creating one csv file#
I want to make multiple csv files by going through one csv file that has repo names and using the rest api in github to gather information
# Pick one repository name from the input sheet and build its contributors URL.
new_url = pd.read_csv("Ovio.csv")
owner = new_url['Repo_Name'].iloc[682]
url = 'https://api.github.com/repos/{owner}/contributors'.format(owner=owner)

# Ask the GitHub REST API for the contributors list as JSON.
headers = {"Accept": "application/json", "Content-Type": "application/json"}
response = requests.request("GET", url, headers=headers, data=[])
data = response.json()

# Keep only the three columns that end up in the CSV file, in header order.
csvheader = ['login', 'html_url', 'contributions']
ourdata = [[x[column] for column in csvheader] for x in data]

with open("api.csv", 'w', encoding='UTF8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(csvheader)
    writer.writerows(ourdata)

# Hand the finished file to the system's `open` command.
os.system('open api.csv')
How can the JSON output be formatted in a way that doesn't include the literal \n text, and instead shows these as new lines as intended? This is what the saved output file looks like:
But, this is how it looks when I use print, which is what it should look like:
import requests
import json
def get_all_time_entries():
    """Fetch the time-entry pages and return them as one indented JSON string."""
    url_address = "***"
    headers = {"Authorization": "***", "api-version": "2020-01-31"}

    # Initial request (meant for finding the page count); its result is not used below.
    r = requests.get(url=url_address, headers=headers).json()
    total_pages = 605

    # One parsed JSON response per page, for pages 1 .. total_pages - 1.
    all_time_entries = [
        requests.get(url="***" + str(page), headers=headers).json()
        for page in range(1, total_pages)
    ]

    # Serialise the collected pages to a key-sorted, indented JSON string.
    return json.dumps(all_time_entries, sort_keys=True, indent=4)


with open('appointmentsHistory.json', 'w', encoding='utf-8') as out_file:
    # dump (not dumps) writes straight into the open file
    json.dump(get_all_time_entries(), out_file, sort_keys=True, indent=4)
json.dumps() transforms the data dictionary into a string, and then json.dump() writes the JSON representation of that string to the file.
To resolve, remove json.dumps() from the get_all_time_entries() method. json.dump() will take the dictionary in directly and transform it into a JSON string for you.
import requests
import json
def get_all_time_entries():
    """Fetch the time-entry pages and return them as a Python list.

    Returns:
        list: one parsed JSON response per requested page, in page order.
            The list is returned as-is so that the caller can serialise it
            exactly once (e.g. with json.dump).
    """
    url_address = "***"
    headers = {
        "Authorization": "***",
        "api-version": "2020-01-31"
    }
    # find out total number of pages
    # TODO: derive total_pages from this response instead of hard-coding it.
    r = requests.get(url=url_address, headers=headers).json()
    total_pages = 605
    # results will be appended to this list
    all_time_entries = []
    # NOTE(review): range(1, total_pages) stops at page total_pages - 1;
    # confirm whether the last page should be requested as well.
    for page in range(1, total_pages):
        url = "***" + str(page)
        response = requests.get(url=url, headers=headers).json()
        all_time_entries.append(response)
    # Return the list itself: `data` no longer exists once the json.dumps()
    # call is removed, so `return data` would raise a NameError.
    return all_time_entries


with open('appointmentsHistory.json', 'w', encoding='utf-8') as f:
    # json.dump serialises the list directly into the file, once
    json.dump(get_all_time_entries(), f, sort_keys=True, indent=4)
json.dump() takes an object, you seem to be passing it a JSON-like string.
I am loading json from files using the code:
file = 'file_name'
obj_list = []
# Treat every line of the file as one JSON document and collect the parsed results.
with open(file) as source:
    obj_list.extend(loads(raw_line) for raw_line in source)
I get error:
JSONDecodeError: Extra data: line 1 column 21 (char 20)
All my files look like this but much larger.
{"some":"property2"}{"some":"property"}{"some":"property3"}
Is there a way to parse this in python for a large number of files?
Your JSON is not valid. It should be something like this:
[{"some": "property2"}, {"some": "property"}, {"some": "property3"}]
import json
# The file holds JSON objects written back to back with no separator, e.g.
# {"a": 1}{"b": 2}.  Wrapping that text in [] is still not valid JSON because
# the commas between the objects are missing, so decode one object at a time.
with open(file, 'r') as f:
    json_str = f.read()

decoder = json.JSONDecoder()
obj_list = []
pos = 0
while pos < len(json_str):
    # raw_decode does not accept leading whitespace, so step over any
    # spaces or newlines that sit between two objects.
    if json_str[pos].isspace():
        pos += 1
        continue
    # raw_decode returns the parsed object and the index just past its end.
    obj, pos = decoder.raw_decode(json_str, pos)
    obj_list.append(obj)
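This reads the whole content and loads it with the json package as a list of objects. Note that wrapping the content in [] only yields valid JSON when the objects are separated by commas; objects written back to back, as in the question, have to be decoded one at a time (for example with json.JSONDecoder.raw_decode).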
I'm collecting tweets from Twitter's API. My code returns a string, which I have transformed into a dictionary. I am looking to create a CSV where I store this data in columns. I have attached an image of what my CSV currently looks like.
current CSV image:
.
What do you suggest for creating something like the following?
desired outcome:
# Write each key/value pair of the dict y as one two-column CSV row.
# newline='' lets the csv module control the line endings itself; without it,
# platforms that translate '\n' to '\r\n' end up with blank rows in the file.
with open('dict.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in y.items():
        writer.writerow([key, value])
#with open('my_file.csv', 'w') as f:
# [f.write('{0},{1}\n'.format(key, value)) for key, value in y.items()]
Full code:
import requests
import os
import json
import pandas as pd
import csv
import sys
import time
bearer_token = "insert here"
search_url = "https://api.twitter.com/2/tweets/search/all"
# A dict literal keeps only the last value given for a repeated key, so the
# two 'tweet.fields' entries ('text' and 'created_at') are merged into one
# comma-separated value; otherwise 'text' is silently dropped.
query_params = {
    'query': '(Johnson&Johnson) -is:retweet -is:verified -baby -lotion -shampoo',
    'tweet.fields': 'text,created_at',
    'start_time': '2021-01-20T00:00:01.000Z',
    'end_time': '2021-02-17T23:30:00.000Z',
}
#query_params={'query':'(vaccine OR vaccinated) -is:retweet -is:verified -RT -baby -lotion -shampoo&start_time=2021-01-20T00:00:01.000Z&end_time=2021-02-20T23:30:00.000Z&max_results=10&tweet.fields=author_id,conversation_id,created_at,geo,id,lang,source,text&expansions=author_id&place.fields=full_name&user.fields=created_at,description,entities,id,location,name,url,username'}
def create_headers(bearer_token):
    """Build the request headers that carry *bearer_token* for authorization."""
    return {"Authorization": "Bearer {}".format(bearer_token)}
def connect_to_endpoint(url, headers, params):
    """Send a GET request to *url* and return the decoded JSON body.

    Args:
        url: Endpoint to query.
        headers: HTTP headers to send with the request.
        params: Query-string parameters for the request.

    Returns:
        The response body parsed from JSON.

    Raises:
        Exception: carrying (status code, response text) when the status
            code is not 200.
    """
    # Query the url that was passed in; previously the argument was ignored
    # and the module-level search_url was always used instead.
    response = requests.request("GET", url, headers=headers, params=params)
    print('first:', response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()
def main():
    """Run the tweet search and return the response as a dict.

    Returns:
        dict: the JSON response after a dumps/loads round trip with
            sort_keys=True, so its keys are in sorted order.
    """
    headers = create_headers(bearer_token)
    json_response = connect_to_endpoint(search_url, headers, query_params)
    x = json.dumps(json_response, sort_keys=True)
    y = json.loads(x)
    # Hand the result back to the caller; as a local that was never returned
    # it was discarded as soon as main() finished.
    return y


if __name__ == "__main__":
    main()
Try using DictWriter:
import csv
# newline='' lets the csv module control the line endings itself; without it,
# platforms that translate '\n' to '\r\n' end up with blank rows in the file.
with open(csv_file, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    # Header row first, then one row per dict in dict_data.
    writer.writeheader()
    writer.writerows(dict_data)
For more info, refer to the link below:
How to save a Python Dictionary to a CSV File?
I scraped a site for data and was able to print the desired output in JSON format, but it contains only the values. What I actually need is the data as key/value pairs, saved to an output.json file, so that I can insert it into my Django database. Here is what I have done so far:
import requests
import json
URL ='http://tfda.go.tz/portal/en/trader_module/trader_module/getRegisteredDrugs_products'payload = "draw=1&columns%5B0%5D%5Bdata%5D=no&columns%5B0%5D%5Bname%5D=&columns%5B0%5D%5Bsearchable%5D=True&columns%5B0%5D%5Borderable%5D=True&columns%5B0%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B0%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B1%5D%5Bdata%5D=certificate_no&columns%5B1%5D%5Bname%5D=&columns%5B1%5D%5Bsearchable%5D=True&columns%5B1%5D%5Borderable%5D=True&columns%5B1%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B1%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B2%5D%5Bdata%5D=brand_name&columns%5B2%5D%5Bname%5D=&columns%5B2%5D%5Bsearchable%5D=True&columns%5B2%5D%5Borderable%5D=True&columns%5B2%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B2%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B3%5D%5Bdata%5D=classification_name&columns%5B3%5D%5Bname%5D=&columns%5B3%5D%5Bsearchable%5D=True&columns%5B3%5D%5Borderable%5D=True&columns%5B3%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B3%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B4%5D%5Bdata%5D=common_name&columns%5B4%5D%5Bname%5D=&columns%5B4%5D%5Bsearchable%5D=True&columns%5B4%5D%5Borderable%5D=True&columns%5B4%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B4%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B5%5D%5Bdata%5D=dosage_form&columns%5B5%5D%5Bname%5D=&columns%5B5%5D%5Bsearchable%5D=True&columns%5B5%5D%5Borderable%5D=True&columns%5B5%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B5%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B6%5D%5Bdata%5D=product_strength&columns%5B6%5D%5Bname%5D=&columns%5B6%5D%5Bsearchable%5D=True&columns%5B6%5D%5Borderable%5D=True&columns%5B6%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B6%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B7%5D%5Bdata%5D=registrant&columns%5B7%5D%5Bname%5D=&columns%5B7%5D%5Bsearchable%5D=True&columns%5B7%5D%5Borderable%5D=True&columns%5B7%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B7%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B8%5D%5Bdata%5D=registrant_country&columns%5B8%5D%5Bname%5D=&columns%5B8%5D%5Bsearchable%5D=True&columns%5B8%5D%5Borderable%5D=True&columns%5B
8%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B8%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B9%5D%5Bdata%5D=manufacturer&columns%5B9%5D%5Bname%5D=&columns%5B9%5D%5Bsearchable%5D=True&columns%5B9%5D%5Borderable%5D=True&columns%5B9%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B9%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B10%5D%5Bdata%5D=manufacturer_country&columns%5B10%5D%5Bname%5D=&columns%5B10%5D%5Bsearchable%5D=True&columns%5B10%5D%5Borderable%5D=True&columns%5B10%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B10%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B11%5D%5Bdata%5D=expiry_date&columns%5B11%5D%5Bname%5D=&columns%5B11%5D%5Bsearchable%5D=True&columns%5B11%5D%5Borderable%5D=True&columns%5B11%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B11%5D%5Bsearch%5D%5Bregex%5D=False&columns%5B12%5D%5Bdata%5D=id&columns%5B12%5D%5Bname%5D=&columns%5B12%5D%5Bsearchable%5D=True&columns%5B12%5D%5Borderable%5D=True&columns%5B12%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B12%5D%5Bsearch%5D%5Bregex%5D=False&order%5B0%5D%5Bcolumn%5D=0&order%5B0%5D%5Bdir%5D=asc&start=0&length=3911&search%5Bvalue%5D=&search%5Bregex%5D=False"
# POST the form-encoded payload and print the fields of the returned rows as JSON.
with requests.Session() as s:
s.headers={"User-Agent":"Mozilla/5.0"}
s.headers.update({'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'})
res = s.post(URL, data = payload)
# Each element of the response's 'data' list is indexed by the field names below.
for data in res.json()['data']:
serial = data['no']
certno = data['certificate_no']
brndname = data['brand_name']
clssification = data['classification_name']
common_name = data['common_name']
dosage_form = data['dosage_form']
expiry_date = data['expiry_date']
manufacturer = data['manufacturer']
manufacturer_country = data['manufacturer_country']
product_strength = data['product_strength']
registrant = data['registrant']
registrant_country = data['registrant_country']
# Only the values are packed into this tuple, so the field names are not part of the JSON output.
output = (serial,certno,brndname,clssification,common_name,dosage_form,expiry_date,manufacturer, manufacturer_country,product_strength,registrant, registrant_country )
my_list = output
json_str = json.dumps(my_list)
print (json_str)
And here is my attached output screenshot
So how do I approach this?
Use json.dump
# Open the target file for writing and serialise the list straight into it.
with open(path, 'w') as file:
[...]
json.dump(myPythonList, file)
# Finish the file with a newline after the JSON text.
file.write('\n')