How to convert Avro file to CSV file in python? - python

The process I am following is -
Converting Avro to JSON
Then converting JSON to CSV
Is there any direct way to convert the Avro file to CSV?

This is how I converted an avro file to csv:
from fastavro import reader
import csv

# Convert abc.avro to test.csv: write the field names once as a header row,
# then one CSV row per Avro record, and finally print the record count.
head = True
count = 0
# NOTE(review): the csv file handle is never closed here; wrapping it in a
# `with` block (or calling .close()) would guarantee the output is flushed.
f = csv.writer(open("test.csv", "w+"))
with open('abc.avro', 'rb') as fo:
    avro_reader = reader(fo)
    for emp in avro_reader:
        # print(emp)
        if head == True:
            # First record: emit its keys as the CSV header.
            header = emp.keys()
            f.writerow(header)
            head = False
        count += 1
        f.writerow(emp.values())
print(count)

Related

Parsing JSON into CSV in Python

I'm trying to parse JSON files into CSV. I've been able to get the headers of the JSON file to be output into the CSV but I can't figure out how to get the data into the file.
# Python program to convert
# JSON file to CSV
import json
import csv

# Opening JSON file and loading the data
# into the variable data
with open('test1.json') as json_file:
    data = json.load(json_file)

for i in range(len(data)):
    training_data = data[i]['profile']
# NOTE(review): the loop above rebinds training_data on every pass, so only
# the LAST record's profile survives — this is likely why the data never
# makes it into the CSV as expected. TODO confirm against the input file.

# now we will open a file for writing
data_file = open('data_file.csv', 'w')

# create the csv writer object
csv_writer = csv.writer(data_file)

# Counter variable used for writing
# headers to the CSV file
count = 0

# type(training_data)
for profile in training_data:
    if count == 0:
        # Writing headers of CSV file
        header = training_data.keys()
        csv_writer.writerow(header)
        count += 1
    # NOTE(review): this writes the whole dict's values once per key in it
    # (the loop variable `profile` is never used) — duplicated rows result.
    csv_writer.writerow(training_data.values())

data_file.close()
This is the file I'm trying to parse:
https://textdoc.co/OuphoV5saiwWYS8g
If someone could help me out I'd be eternally grateful
would something like this work for you?
import pandas as pd
# json_file is the path (or file handle) of the JSON document to convert;
# pandas infers the tabular structure and writes it straight out as CSV.
df = pd.read_json(json_file)
df.to_csv('data_file.csv')
or for more complex nested json, you may have to load as a dictionary and manipulate:
# json_str holds the raw JSON text; adjust the 'data' key below to match
# wherever the list of records lives in your document.
data = json.loads(json_str)
data_transformed = [i for i in data['data']]
df = pd.DataFrame(data_transformed )
df.to_csv('data_file.csv')

How can I convert JSON format text to dataframe?

I am trying to convert below JSON format text to pandas or spark data frame, but it is giving below error.
ERROR: JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Python CODE:
# import pandas to read json file
import json

# Read sample.json as raw text and strip the artifacts (tabs, newlines,
# trailing commas before } or ]) that make it invalid JSON, then parse it.
path = "sample.json"
with open(path, 'r') as myfile:
    data = myfile.read()

# NOTE(review): these blind replaces also alter tab/newline characters and
# ",}" / ",]" sequences INSIDE string values — fine for this sample, but not
# safe for arbitrary input.
data = data.replace('\t', '')
data = data.replace('\n', '')
data = data.replace(',}', '}')
data = data.replace(',]', ']')
obj = json.loads(data)
JSON file format
Output of data after reading .json file by using open function
How can I convert above text as a data frame?
I got it working; I added a few lines of code:
# Read sample.json as text, patch it into something parseable, split it into
# one string per record, and expand those records into DataFrame columns.
# NOTE(review): this snippet uses `pd` but never imports pandas — it relies
# on an earlier `import pandas as pd` in the session.
path = "sample.json"
with open(path, 'r') as myfile:
    data = myfile.read()

data = data.replace('\t', '')
data = data.replace('\n', '')
data = data.replace(',}', '}')
data = data.replace(',]', ']')
# Replace JSON null with an empty-string literal so eval() below can cope.
data = data.replace("null", "''")

liss = []
data1 = data[1:-1]          # drop the enclosing [ ]
data2 = data1.split("},")   # one chunk per record
for i in data2:
    # Re-append the closing brace that split() consumed, except on the
    # final chunk which still has it.
    last_value = i[len(i)-1]
    if last_value != "}":
        new_text = i + "}"
        liss.append(new_text)
    else:
        new_text = i
        liss.append(new_text)

sample_df = pd.DataFrame({"Col1": liss})
# WARNING(review): eval() executes arbitrary code from the file's contents —
# never use this on untrusted input; json.loads would be the safe choice.
sample_df["Col1"] = sample_df["Col1"].apply(lambda x: dict(eval(x)))
df3 = sample_df["Col1"].apply(pd.Series)
df3
I think you can read the json and save it in a dictionary.
Once you have this dictionary you can create a spark dataframe with the following line of code
df = spark.createDataFrame(dict)

Converting a large CSV file to multiple JSON files using Python

I am currently using the following code to convert a large CSV file to a JSON file.
import csv
import json
def csv_to_json(csvFilePath, jsonFilePath):
    """Convert a CSV file into a single JSON file.

    Reads csvFilePath with csv.DictReader (the first CSV row supplies the
    keys) and writes all rows to jsonFilePath as one JSON array of objects,
    indented for readability.
    """
    jsonArray = []
    with open(csvFilePath, encoding='utf-8') as csvf:
        csvReader = csv.DictReader(csvf)
        for row in csvReader:
            jsonArray.append(row)
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonString = json.dumps(jsonArray, indent=4)
        jsonf.write(jsonString)
# Script driver: convert test_data.csv into test_data.json in the working directory.
csvFilePath = r'test_data.csv'
jsonFilePath = r'test_data.json'
csv_to_json(csvFilePath, jsonFilePath)
This code works fine and I am able to convert the CSV to JSON without any issues. However, as the CSV file contains 600,000+ rows and hence as many items in my JSON, it has become very difficult to manage the JSON file.
I would like to modify my above code such that for every 5000 rows of the CSV, the data is written into a new JSON file. Ideally, I would be having 120 (600,000/5000) JSON files in this case.
How can I do the same?
Split up your read/write methods and add a simple threshold:
JSON_ENTRIES_THRESHOLD = 5000  # modify to whatever you see suitable


def write_json(json_array, filename):
    """Dump json_array to filename as JSON."""
    with open(filename, 'w', encoding='utf-8') as jsonf:
        json.dump(json_array, jsonf)  # note the usage of .dump directly to a file descriptor


def csv_to_json(csvFilePath, jsonFilePath):
    """Split a CSV into numbered JSON files of at most JSON_ENTRIES_THRESHOLD rows.

    Each output file is named "<jsonFilePath>-<index>.json", with index
    starting at 0.
    """
    jsonArray = []
    with open(csvFilePath, encoding='utf-8') as csvf:
        csvReader = csv.DictReader(csvf)
        filename_index = 0
        for row in csvReader:
            jsonArray.append(row)
            if len(jsonArray) >= JSON_ENTRIES_THRESHOLD:
                # Threshold reached: flush this chunk and start a new one.
                # BUGFIX: use the jsonFilePath parameter as the base name;
                # the original wrote the literal string "jsonFilePath-...".
                write_json(jsonArray, f"{jsonFilePath}-{filename_index}.json")
                filename_index += 1
                jsonArray = []
    # Finally, write out the remainder — but only if there is one, so a row
    # count that divides the threshold exactly doesn't leave an empty file.
    if jsonArray:
        write_json(jsonArray, f"{jsonFilePath}-{filename_index}.json")

Converting JSON to CSV, CSV is empty

I'm attempting to convert yelps data set that is in JSON to a csv format. The new csv file that is created is empty.
I've tried different ways to iterate through the JSON but they all give me a zero bytes file.
The json file looks like this:
{"business_id":"1SWheh84yJXfytovILXOAQ","name":"Arizona Biltmore Golf Club","address":"2818 E Camino Acequia Drive","city":"Phoenix","state":"AZ","postal_code":"85016","latitude":33.5221425,"longitude":-112.0184807,"stars":3.0,"review_count":5,"is_open":0,"attributes":{"GoodForKids":"False"},"categories":"Golf, Active Life","hours":null}
import json
import csv

infile = open("business.json", "r")
outfile = open("business2.csv", "w")
# NOTE(review): json.load raises "Extra data" when the file holds one JSON
# object per line (JSON Lines) instead of a single document — which matches
# the reported error; parsing line-by-line with json.loads would be needed.
data = json.load(infile)
infile.close()
out = csv.writer(outfile)
# NOTE(review): data[0] assumes the parsed result is a list; with the sample
# shown it would be a single dict, making data.keys() the correct call.
out.writerow(data[0].keys())
for row in data:
    out.writerow(row.values())
I get an "extra data" message when the code runs. The new business2 csv file is empty and the size is zero bytes.
If your JSON has only one row, then try this:
# Single-object variant: write the one dict's keys as the header row and its
# values as the only data row.
infile = open("business.json", "r")
# NOTE(review): outfile is never closed here, so the CSV may stay unflushed
# (0 bytes) until the interpreter exits — a `with` block would fix that.
outfile = open("business2.csv", "w")
data = json.load(infile)
infile.close()
out = csv.writer(outfile)
# print(data.keys())
out.writerow(data.keys())
out.writerow(data.values())
Please try the code below. By using the `with` statement, the file is closed automatically when control moves out of the scope of the `with` block.
# Load the single JSON object, then write its keys/values as two CSV rows.
infile = open("business.json", "r")
data = json.load(infile)
infile.close()

headers = list(data.keys())
values = list(data.values())

# BUGFIX: the original opened business2.csv a first time with open(...,"w")
# before this block, leaking that handle; the `with` below is the only open
# needed — it closes (and flushes) the file automatically on exit.
with open("business2.csv", "w") as outfile:
    out = csv.writer(outfile)
    out.writerow(headers)
    out.writerow(values)
You need to use with to close file.
import json
import csv

# Read the one JSON object, then emit a two-row CSV (header + values).
infile = open("business.json", "r")
data = json.load(infile)
infile.close()

# The `with` statement guarantees the CSV is closed and flushed on exit.
with open("business2.csv", "w") as outfile:
    out = csv.writer(outfile)
    out.writerow(list(data.keys()))
    out.writerow(list(data.values()))

Edit CSV file in python which reads values from another json file in python

I wanted to edit a csv file which reads the value from one of my another json file in python 2.7
my csv is : a.csv
a,b,c,d
,10,12,14
,11,14,15
my json file is a.json
{"a":20}
I want the column 'a' to be matched against the JSON file. If there is a match, the value should be copied from the JSON into my CSV file, so that the final output of my CSV file looks like this:
a,b,c,d
20,10,12,14
20,11,14,15
Till now I what I have tried is
# NOTE(review): this snippet does not run as posted (the author asked readers
# to ignore syntax and indentation errors); concrete issues are flagged inline.
fileCSV = open('a.csv', 'a')
fileJSON = open('a.json', 'r')
# BUG: file objects have no .json() method — json.load(fileJSON) is meant here.
jsonData = fileJSON.json()
# BUG: range() needs an integer; jsonData would be a parsed JSON object here.
for k in range(jsonData):
# NOTE(review): csvRow is never defined anywhere in this snippet.
for i in csvRow:
for j in jsonData.keys():
if i == j:
# NOTE(review): `self` is used outside any class, so self.count / self.data
# cannot resolve as written.
if self.count == 0:
self.data = jsonData[j]
self.count = 1
else:
self.data = self.data + "," + jsonData[j]
self.count = 0
fileCSV.write(self.data)
fileCSV.write("\n")
# NOTE(review): reassigning the loop variable has no effect on a Python for loop.
k += 1
fileCSV.close()
print("File created successfully")
I will be really thankful if anyone can help me for this.
please ignore any syntactical and indentation error.
Thank You.
Some basic string parsing will get you there. I wrote a script which works for the simple scenario you refer to.
check if this solves your problem:
import json
from collections import OrderedDict
def list_to_csv(listdat):
    """Join the items of listdat into one comma-separated string.

    Each item is converted with str(); an empty list yields "".
    """
    # str.join replaces the original build-then-strip-leading-comma loop.
    return ",".join(str(val) for val in listdat)
# Read csvfile.csv, copy every key/value pair from jsonfile.json into every
# row, and write the merged table to outcsvfile.csv.
lines = []
csvfile = "csvfile.csv"
outcsvfile = "outcsvfile.csv"
jsonfile = "jsonfile.json"

with open(csvfile, encoding='UTF-8') as a_file:
    for line in a_file:
        lines.append(line.strip())

columns = lines[0].split(",")   # header row supplies the column names
data = lines[1:]

# Build one OrderedDict per data row, keyed by column name.
whole_data = []
for row in data:
    fields = row.split(",")
    i = 0
    rowData = OrderedDict()
    for column in columns:
        rowData[columns[i]] = fields[i]
        i += 1
    whole_data.append(rowData)

with open(jsonfile) as json_file:
    jsondata = json.load(json_file)

# Overwrite (or add) each JSON key's value in every row.
keys = list(jsondata.keys())
for key in keys:
    value = jsondata[key]
    for each_row in whole_data:
        each_row[key] = value

# 'w' mode: the output file is always truncated and rewritten.
with open(outcsvfile, mode='w', encoding='UTF-8') as b_file:
    b_file.write(list_to_csv(columns) + '\n')
    for row_data in whole_data:
        row_list = []
        for ecolumn in columns:
            row_list.append(row_data.get(ecolumn))
        b_file.write(list_to_csv(row_list) + '\n')
CSV output is not written to the source file but to a different file.
The output file is also always truncated and written, hence the 'w' mode.
I would recommend using csv.DictReader and csv.DictWriter classes which will read into and out of python dicts. This would make it easier to modify the dict values that you read in from the JSON file.

Categories

Resources