fastavro - Convert json file into avro file - python

A bit new to Avro and Python.
I am trying to do a simple conversion to Avro using the fastavro library, as the native Apache Avro library is just a bit too slow.
I want to:
1. Take a JSON file
2. Convert the data to Avro.
My problem is that my JSON doesn't seem to be in the correct 'record' format to be converted to Avro. I even tried putting my JSON into a string variable and making it look similar to the syntax shown at https://fastavro.readthedocs.io/en/latest/writer.html:
{u'station': u'011990-99999', u'temp': 22, u'time': 1433270389},
{u'station': u'011990-99999', u'temp': -11, u'time': 1433273379},
{u'station': u'012650-99999', u'temp': 111, u'time': 1433275478},
Here is my code:
from fastavro import json_writer, parse_schema, writer
import json

key = "test.json"
schemaFileName = "test_schema.avsc"

with open(r'C:/Path/to/file/' + schemaFileName) as sc:
    w = json.load(sc)

schema = parse_schema(w)

with open(r'C:/Path/to/file/' + key) as js:
    x = json.load(js)

with open('C:/Path/to/file/output.avro', 'wb') as out:
    writer(out, schema, x, codec='deflate')
Here is what I get as output:
File "avropython.py", line 26, in <module>
writer(out, schema,x, codec='deflate')
File "fastavro\_write.pyx", line 608, in fastavro._write.writer
ValueError: "records" argument should be an iterable, not dict
My JSON file and schema, respectively:
"joined": false,
"toward": {
"selection": "dress",
"near": true,
"shoulder": false,
"fine": -109780201.3804388,
"pet": {
"stood": "saddle",
"live": false,
"leather": false,
"tube": false,
"over": false,
"impossible": true
},
"higher": false
},
"wear": true,
"asleep": "door",
"connected": true,
"stairs": -1195512399.5000324
}
{
  "name": "MyClass",
  "type": "record",
  "namespace": "com.acme.avro",
  "fields": [
    {"name": "joined", "type": "boolean"},
    {
      "name": "toward",
      "type": {
        "name": "toward",
        "type": "record",
        "fields": [
          {"name": "selection", "type": "string"},
          {"name": "near", "type": "boolean"},
          {"name": "shoulder", "type": "boolean"},
          {"name": "fine", "type": "float"},
          {
            "name": "pet",
            "type": {
              "name": "pet",
              "type": "record",
              "fields": [
                {"name": "stood", "type": "string"},
                {"name": "live", "type": "boolean"},
                {"name": "leather", "type": "boolean"},
                {"name": "tube", "type": "boolean"},
                {"name": "over", "type": "boolean"},
                {"name": "impossible", "type": "boolean"}
              ]
            }
          },
          {"name": "higher", "type": "boolean"}
        ]
      }
    },
    {"name": "wear", "type": "boolean"},
    {"name": "asleep", "type": "string"},
    {"name": "connected", "type": "boolean"},
    {"name": "stairs", "type": "float"}
  ]
}
If anyone could help me out, it would be greatly appreciated!!

As the error says (ValueError: "records" argument should be an iterable, not dict), the records argument you pass to writer needs to be an iterable, but you are passing a single dict. One way to solve this is to change your last line to writer(out, schema, [x], codec='deflate').
Alternatively, there is a schemaless_writer that can be used to write just a single record: https://fastavro.readthedocs.io/en/latest/writer.html#fastavro._write_py.schemaless_writer
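A minimal sketch of both options, reusing the placeholder paths from the question:

from fastavro import parse_schema, writer, schemaless_writer
import json

with open('C:/Path/to/file/test_schema.avsc') as sc:
    schema = parse_schema(json.load(sc))

with open('C:/Path/to/file/test.json') as js:
    record = json.load(js)  # a single dict

# Option 1: wrap the dict in a list so writer() receives an iterable of records
with open('C:/Path/to/file/output.avro', 'wb') as out:
    writer(out, schema, [record], codec='deflate')

# Option 2: schemaless_writer() writes exactly one record (no container header)
with open('C:/Path/to/file/output_schemaless.avro', 'wb') as out:
    schemaless_writer(out, schema, record)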

Related

Unable to write avro data to kafka using python

I'm using Kafka kafka_2.11-0.11.0.2 and Confluent version 3.3.0 for the Schema Registry.
I have defined an avro schema as follows:
{
  "namespace": "com.myntra.search",
  "type": "record",
  "name": "SearchDataIngestionObject",
  "fields": [
    {"name": "timestamp", "type": "long"},
    {"name": "brandList", "type": {"type": "array", "items": "string"}},
    {"name": "articleTypeList", "type": {"type": "array", "items": "string"}},
    {"name": "gender", "type": {"type": "array", "items": "string"}},
    {"name": "masterCategoryList", "type": {"type": "array", "items": "string"}},
    {"name": "subCategoryList", "type": {"type": "array", "items": "string"}},
    {"name": "quAlgo", "type": {"type": "array", "items": "string"}},
    {"name": "colours", "type": {"type": "array", "items": "string"}},
    {"name": "isLandingPage", "type": "boolean"},
    {"name": "isUserQuery", "type": "boolean"},
    {"name": "isAutoSuggest", "type": "boolean"},
    {"name": "userQuery", "type": "string"},
    {"name": "correctedQuery", "type": "string"},
    {"name": "completeSolrQuery", "type": "string"},
    {"name": "atsaList", "type": {"type": "map", "values": {"type": "array", "items": "string"}}},
    {"name": "quMeta", "type": {"type": "map", "values": "string"}},
    {"name": "requestId", "type": "string"}
  ]
}
And I'm trying to write some data to kafka as follows:
value = {
    "timestamp": 1597399323000,
    "brandList": ["brand_value"],
    "articleTypeList": ["articleType_value"],
    "gender": ["gender_value"],
    "masterCategoryList": ["masterCategory_value"],
    "subCategoryList": ["subCategory_value"],
    "quAlgo": ["quAlgo_value"],
    "colours": ["colours_value"],
    "isLandingPage": False,
    "isUserQuery": False,
    "isAutoSuggest": False,
    "userQuery": "userQuery_value",
    "correctedQuery": "correctedQuery_value",
    "completeSolrQuery": "completeSolrQuery_value",
    "atsaList": {
        "atsa_key1": ["atsa_value1"],
        "atsa_key2": ["atsa_value2"],
        "atsa_key3": ["atsa_value3"]
    },
    "quMeta": {
        "quMeta_key1": "quMeta_value1",
        "quMeta_key2": "quMeta_value2",
        "quMeta_key3": "quMeta_value3"
    },
    "requestId": "requestId_value"
}

topic = "search"
key = str(uuid.uuid4())
producer.produce(topic=topic, key=key, value=value)
producer.flush()
But I'm getting the following error:
Traceback (most recent call last):
  File "producer.py", line 61, in <module>
    producer.produce(topic=topic, key=key, value=value)
  File "/Library/Python/2.7/site-packages/confluent_kafka/avro/__init__.py", line 99, in produce
    value = self._serializer.encode_record_with_schema(topic, value_schema, value)
  File "/Library/Python/2.7/site-packages/confluent_kafka/avro/serializer/message_serializer.py", line 118, in encode_record_with_schema
    return self.encode_record_with_schema_id(schema_id, record, is_key=is_key)
  File "/Library/Python/2.7/site-packages/confluent_kafka/avro/serializer/message_serializer.py", line 152, in encode_record_with_schema_id
    writer(record, outf)
  File "/Library/Python/2.7/site-packages/confluent_kafka/avro/serializer/message_serializer.py", line 86, in <lambda>
    return lambda record, fp: writer.write(record, avro.io.BinaryEncoder(fp))
  File "/Library/Python/2.7/site-packages/avro/io.py", line 979, in write
    raise AvroTypeException(self.writers_schema, datum)
avro.io.AvroTypeException: The datum {'quAlgo': ['quAlgo_value'], 'userQuery': 'userQuery_value', 'isAutoSuggest': False, 'isLandingPage': False, 'timestamp': 1597399323000, 'articleTypeList': ['articleType_value'], 'colours': ['colours_value'], 'correctedQuery': 'correctedQuery_value', 'quMeta': {'quMeta_key1': 'quMeta_value1', 'quMeta_key2': 'quMeta_value2', 'quMeta_key3': 'quMeta_value3'}, 'requestId': 'requestId_value', 'gender': ['gender_value'], 'isUserQuery': False, 'brandList': ['brand_value'], 'masterCategoryList': ['masterCategory_value'], 'subCategoryList': ['subCategory_value'], 'completeSolrQuery': 'completeSolrQuery_value', 'atsaList': {'atsa_key1': ['atsa_value1'], 'atsa_key2': ['atsa_value2'], 'atsa_key3': ['atsa_value3']}} is not an example of the schema {
  "namespace": "com.myntra.search",
  "type": "record",
  "name": "SearchDataIngestionObject",
  "fields": [
    {"type": "long", "name": "timestamp"},
    {"type": {"items": "string", "type": "array"}, "name": "brandList"},
    {"type": {"items": "string", "type": "array"}, "name": "articleTypeList"},
    {"type": {"items": "string", "type": "array"}, "name": "gender"},
    {"type": {"items": "string", "type": "array"}, "name": "masterCategoryList"},
    {"type": {"items": "string", "type": "array"}, "name": "subCategoryList"},
    {"type": {"items": "string", "type": "array"}, "name": "quAlgo"},
    {"type": {"items": "string", "type": "array"}, "name": "colours"},
    {"type": "boolean", "name": "isLandingPage"},
    {"type": "boolean", "name": "isUserQuery"},
    {"type": "boolean", "name": "isAutoSuggest"},
    {"type": "string", "name": "userQuery"},
    {"type": "string", "name": "correctedQuery"},
    {"type": "string", "name": "completeSolrQuery"},
    {"type": {"values": {"items": "string", "type": "array"}, "type": "map"}, "name": "atsaList"},
    {"type": {"values": "string", "type": "map"}, "name": "quMeta"},
    {"type": "string", "name": "requestId"}
  ]
}
I even tried the same example as given here, but it doesn't work and throws the same error.
The error in your exception says that the data you are providing is the following:
{'userQuery': 'userQuery_value',
'isAutoSuggest': False,
'isLandingPage': False,
'correctedQuery': 'correctedQuery_value',
'isUserQuery': False,
'timestamp': 1597399323000,
'completeSolrQuery': 'completeSolrQuery_value',
'requestId': 'requestId_value'}
This is much less than what you claim to be providing in your example.
Go back to your original code and, on line 60, just before producer.produce(topic=topic, key=key, value=value), add a simple print(value). That way you can make sure you are sending the right value and that it hasn't been overwritten by some other line of code.
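For example, a sketch of that debug step (the surrounding producer setup is assumed from your question):

import json

# Print the exact datum about to be serialized, so any field that was
# dropped or overwritten earlier in the script becomes visible.
print(json.dumps(value, indent=2))
producer.produce(topic=topic, key=key, value=value)
producer.flush()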

Python Script to convert multiple json files into a single csv

{
  "type": "Data",
  "version": "1.0",
  "box": {
    "identifier": "abcdef",
    "serial": "12345678"
  },
  "payload": {
    "Type": "EL",
    "Version": "1",
    "Result": "Successful",
    "Reference": null,
    "Box": {
      "Identifier": "abcdef",
      "Serial": "12345678"
    },
    "Configuration": {
      "EL": "1"
    },
    "vent": [
      {
        "ventType": "Arm",
        "Timestamp": "2020-03-18T12:17:04+10:00",
        "Parameters": [
          {"Name": "Arm", "Value": "LT"},
          {"Name": "Status", "Value": "LD"}
        ]
      },
      {
        "ventType": "Arm",
        "Timestamp": "2020-03-18T12:17:24+10:00",
        "Parameters": [
          {"Name": "Arm", "Value": "LT"},
          {"Name": "Status", "Value": "LD"}
        ]
      },
      {
        "EventType": "TimeUpdateCompleted",
        "Timestamp": "2020-03-18T02:23:21.2979668Z",
        "Parameters": [
          {"Name": "ActualAdjustment", "Value": "PT0S"},
          {"Name": "CorrectionOffset", "Value": "PT0S"},
          {"Name": "Latency", "Value": "PT0.2423996S"}
        ]
      }
    ]
  }
}
If you're looking to transfer information from a JSON file to a CSV, you can use the following code to read a JSON file into a dictionary in Python:

import json

with open('data.txt') as json_file:
    data_dict = json.load(json_file)

You could then convert this dictionary into a list with either data_dict.items() or data_dict.values(), and then write that list to a CSV file by looping through it, as sketched below.
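For the sample above, a hedged sketch of that loop (file names are placeholders) could flatten each entry of payload["vent"] into one CSV row:

import csv
import json

with open('data.txt') as json_file:
    data_dict = json.load(json_file)

with open('events.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['Timestamp', 'Name', 'Value'])  # header row
    # Each event carries a Timestamp plus a list of Name/Value parameters.
    for event in data_dict['payload']['vent']:
        for param in event['Parameters']:
            writer.writerow([event['Timestamp'], param['Name'], param['Value']])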

Confluent Python Avro Producer: The datum {'..'} is not an example of the schema

I'm unable to produce data for a specific schema, and I can't understand why. The example data included as a dictionary in the code was created directly with the Confluent "avro-random-generator", so the example data must be correct, since it's derived directly from the schema. Both the Schema Registry and the Avro Random Generator are Confluent tools, so it can't be that one of their tools produces example data that does not work with their own Schema Registry.
This is the Schema:
{
  "type": "record",
  "name": "schemaV1",
  "namespace": "net.avro.schemaV1",
  "doc": "",
  "fields": [
    {
      "name": "orderId",
      "type": {"type": "string", "avro.java.string": "String"},
      "doc": ""
    },
    {
      "name": "offerId",
      "type": {"type": "string", "avro.java.string": "String"},
      "doc": ""
    },
    {
      "name": "redeemerId",
      "type": ["null", {"type": "string", "avro.java.string": "String"}],
      "doc": "",
      "default": null
    },
    {
      "name": "eventCancellationType",
      "type": "int",
      "doc": ""
    },
    {
      "name": "ruleIds",
      "type": {
        "type": "array",
        "items": {"type": "string", "avro.java.string": "String"},
        "doc": ""
      }
    },
    {
      "name": "eventOriginator",
      "type": {
        "type": "record",
        "name": "AvroEventPartnerV1",
        "doc": "",
        "fields": [
          {
            "name": "partnerShortName",
            "type": {"type": "string", "avro.java.string": "String"},
            "doc": ""
          },
          {
            "name": "businessUnitShortName",
            "type": ["null", {"type": "string", "avro.java.string": "String"}],
            "doc": "",
            "default": null
          },
          {
            "name": "branchShortName",
            "type": ["null", {"type": "string", "avro.java.string": "String"}],
            "doc": "",
            "default": null
          }
        ]
      }
    },
    {
      "name": "roundedDelta",
      "doc": "",
      "type": {
        "type": "record",
        "name": "AvroAmountV1",
        "doc": "Amount with a currency",
        "fields": [
          {
            "name": "amount",
            "type": {"type": "bytes", "logicalType": "decimal", "precision": 21, "scale": 3},
            "doc": "The amount as a decimal number"
          },
          {
            "name": "currency",
            "type": {"type": "string", "avro.java.string": "String"},
            "doc": ""
          }
        ]
      }
    },
    {
      "name": "rewardableLegalDelta",
      "type": ["null", "AvroAmountV1"],
      "doc": "",
      "default": null
    },
    {
      "name": "receiptNumber",
      "type": {"type": "string", "avro.java.string": "String"},
      "doc": ""
    },
    {
      "name": "referenceReceiptNumber",
      "type": ["null", {"type": "string", "avro.java.string": "String"}],
      "doc": "",
      "default": null
    },
    {
      "name": "eventEffectiveTime",
      "type": {"type": "long"},
      "doc": ""
    }
  ]
}
This is my script:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer, ClientError, ValueSerializerError

BOOTSTRAP_SERVER = 'localhost:9092'
SCHEMA_REGISTRY = 'http://localhost:8081'
TOPIC = 'topicV1'
SCHEMA_PATH = 'schemas/schemaV1.avsc'


def schemaReader(SCHEMA_PATH):
    with open(SCHEMA_PATH, 'r') as file:
        data = file.read()
    return data


def main():
    kafka_config = {
        'bootstrap.servers': BOOTSTRAP_SERVER,
        'schema.registry.url': SCHEMA_REGISTRY
    }
    value_schema = avro.loads(schemaReader(SCHEMA_PATH))
    null = None
    value = {
        "orderId": "a9bcc55f-e2c0-43d6-b793-ff5f295d051d",
        "offerId": "119475017578242889",
        "redeemerId": "1176a01b-b2dc-45a9-91cc-232361e14f99",
        "eventCancellationType": 0,
        "ruleIds": ["ID-IPM00001"],
        "eventOriginator": {"partnerShortName": "partner", "businessUnitShortName": null, "branchShortName": null},
        "roundedDelta": {"amount": "\u8463", "currency": "PTS"},
        "rewardableLegalDelta": {"amount": "\u8463", "currency": "EUR"},
        "receiptNumber": "19b2ff68-ed06-48f0-9ce9-d697c0eadc19",
        "referenceReceiptNumber": null,
        "eventEffectiveTime": 1569494696656
    }
    avroProducer = AvroProducer(kafka_config, default_value_schema=value_schema)
    avroProducer.produce(topic=TOPIC, value=value, value_schema=value_schema)
    avroProducer.flush()


if __name__ == "__main__":
    main()
This is the traceback I'm receiving:
File "producer.py", line 64, in <module>
main()
File "producer.py", line 60, in main
avroProducer.produce(topic=TOPIC, value=value, value_schema=value_schema)
File "/apps/python/python2.7/lib/python2.7/site-packages/confluent_kafka/avro/__init__.py", line 80, in produce
value = self._serializer.encode_record_with_schema(topic, value_schema, value)
File "/apps/python/python2.7/lib/python2.7/site-packages/confluent_kafka/avro/serializer/message_serializer.py", line 115, in encode_record_with_schema
return self.encode_record_with_schema_id(schema_id, record, is_key=is_key)
File "/apps/python/python2.7/lib/python2.7/site-packages/confluent_kafka/avro/serializer/message_serializer.py", line 149, in encode_record_with_schema_id
writer(record, outf)
File "/apps/python/python2.7/lib/python2.7/site-packages/confluent_kafka/avro/serializer/message_serializer.py", line 86, in <lambda>
return lambda record, fp: writer.write(record, avro.io.BinaryEncoder(fp))
File "/apps/python/python2.7/lib/python2.7/site-packages/avro/io.py", line 1042, in write
raise AvroTypeException(self.writers_schema, datum)
avro.io.AvroTypeException: The datum {'..'} is not an example of the schema { ..}
It seems the problem is that amount should be of bytes type, but you have the plain string "\u8463". The avro-random-generator you used creates that byte string using the Java default charset: https://github.com/confluentinc/avro-random-generator/blob/master/src/main/java/io/confluent/avro/random/generator/Generator.java#L373
However, perhaps that default isn't ISO-8859-1, which is what the Java implementation (the reference implementation) uses: https://github.com/apache/avro/blob/bf47ec97e0b7f5701042fac067b73b421a9177b7/lang/java/avro/src/main/java/org/apache/avro/io/JsonEncoder.java#L220
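Beyond the charset question, you can build a schema-conformant amount yourself. Avro's decimal logical type stores the unscaled integer as big-endian two's-complement bytes, so with scale 3 the value 12.500 is encoded as the bytes of 12500. A hedged sketch (Python 3 syntax; the helper name is made up for illustration):

from decimal import Decimal

def decimal_to_bytes(value, scale):
    # Shift the decimal point right by `scale` digits, then encode the
    # resulting unscaled integer as big-endian two's-complement bytes.
    unscaled = int(Decimal(value).scaleb(scale))
    length = max(1, (unscaled.bit_length() + 8) // 8)  # leave room for the sign bit
    return unscaled.to_bytes(length, byteorder='big', signed=True)

# e.g. replace the random "\u8463" with a real decimal amount:
value["roundedDelta"]["amount"] = decimal_to_bytes("12.500", 3)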

Why is JSON-validation always successful using this schema containing 'allOf'?

I have a JSON schema with which I want to validate some data, using Python and the jsonschema module. However, this doesn't quite work as expected: some of the accepted data doesn't appear valid at all (to me, or for the purpose of my application). Sadly, the schema is given to me, so I can't change the schema itself - at least not manually.
This is a shortened version of the schema ('schema.json' in code below):
{
  "type": "object",
  "allOf": [
    {
      "type": "object",
      "allOf": [
        {
          "type": "object",
          "properties": {
            "firstName": {"type": "string"},
            "lastName": {"type": "string"}
          }
        },
        {
          "type": "object",
          "properties": {
            "language": {"type": "integer"}
          }
        }
      ]
    },
    {
      "type": "object",
      "properties": {
        "addressArray": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "streetNumber": {"type": "string"},
              "street": {"type": "string"},
              "city": {"type": "string"}
            }
          }
        }
      }
    }
  ]
}
This is an example of what should be a valid instance ('person.json' in code below):
{
  "firstName": "Sherlock",
  "lastName": "Holmes",
  "language": 1,
  "addresses": [
    {
      "streetNumber": "221B",
      "street": "Baker Street",
      "city": "London"
    }
  ]
}
This is an example of what should be considered invalid ('no_person.json' in code below):
{
  "name": "eggs",
  "colour": "white"
}
And this is the code I used for validating:
from json import load
from jsonschema import Draft7Validator, exceptions

with open('schema.json') as f:
    schema = load(f)
with open('person.json') as f:
    person = load(f)
with open('no_person.json') as f:
    no_person = load(f)

validator = Draft7Validator(schema)

try:
    validator.validate(person)
    print("person.json is valid")
except exceptions.ValidationError:
    print("person.json is invalid")

try:
    validator.validate(no_person)
    print("no_person.json is valid")
except exceptions.ValidationError:
    print("no_person.json is invalid")
Result:
person.json is valid
no_person.json is valid
I expected no_person.json to be invalid. What can be done so that only data like person.json validates successfully? Thank you very much for your help; I'm very new to this (I spent ages searching for an answer).
This is a working schema; pay attention to "required" (when that key is absent, a missing field is simply skipped):
{
  "type": "object",
  "properties": {
    "firstName": {"type": "string"},
    "lastName": {"type": "string"},
    "language": {"type": "integer"},
    "addresses": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "streetNumber": {"type": "string"},
          "street": {"type": "string"},
          "city": {"type": "string"}
        },
        "required": ["streetNumber", "street", "city"]
      }
    }
  },
  "required": ["firstName", "lastName", "language", "addresses"]
}
With it I get:
person.json is valid
no_person.json is invalid
If you have a more complex response structure (an array of objects which themselves contain objects, etc.), let me know.
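The underlying reason the original schema accepts everything: in JSON Schema, "properties" only constrains keys that are actually present, so an object containing none of the listed keys passes every allOf branch. A minimal demonstration (schema literals made up for illustration):

from jsonschema import Draft7Validator

permissive = {"type": "object", "properties": {"firstName": {"type": "string"}}}
strict = dict(permissive, required=["firstName"])

print(Draft7Validator(permissive).is_valid({"name": "eggs"}))  # True: firstName absent, nothing to check
print(Draft7Validator(strict).is_valid({"name": "eggs"}))      # False: firstName is required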

Json to CSV using python and blender 2.74

I have a project in which I have to convert a JSON file into a CSV file.
The JSON sample:
{
  "P_Portfolio Group": {
    "depth": 1,
    "dataType": "PortfolioOverview",
    "levelId": "P_Portfolio Group",
    "path": [
      {
        "label": "Portfolio Group",
        "levelId": "P_Portfolio Group"
      }
    ],
    "label": "Portfolio Group",
    "header": [
      {"id": "Label", "label": "Security name", "type": "text", "contentType": "text"},
      {"id": "SecurityValue", "label": "MioCHF", "type": "text", "contentType": "number"},
      {"id": "SecurityValuePct", "label": "%", "type": "text", "contentType": "pct"}
    ],
    "data": [
      {
        "dataValues": [
          {"value": "Client1", "type": "text"},
          {"value": 2068.73, "type": "number"},
          {"value": 14.0584, "type": "pct"}
        ]
      },
      {
        "dataValues": [
          {"value": "Client2", "type": "text"},
          {"value": 1511.9, "type": "number"},
          {"value": 10.2744, "type": "pct"}
        ]
      },
      {
        "dataValues": [
          {"value": "Client3", "type": "text"},
          {"value": 1354.74, "type": "number"},
          {"value": 9.2064, "type": "pct"}
        ]
      },
      {
        "dataValues": [
          {"value": "Client4", "type": "text"},
          {"value": 1225.78, "type": "number"},
          {"value": 8.33, "type": "pct"}
        ]
      }
    ],
    "summary": [
      {"value": "Total", "type": "text"},
      {"value": 11954.07, "type": "number"},
      {"value": 81.236, "type": "pct"}
    ]
  }
}
And I want to obtain something like:
Client1,2068.73,14.0584
Client2,1511.9,10.2744
Client3,871.15,5.92
Client4,11954.07,81.236
Can you please give me a hint.
import csv
import json

with open("C:\Users\SVC\Desktop\test.json") as file:
    x = json.load(file)

f = csv.writer(open("C:\Users\SVC\Desktop\test.csv", "wb+"))
for x in x:
    f.writerow(x["P_Portfolio Group"]["data"]["dataValues"]["value"])
but it doesn't work.
import csv
import json

# Raw strings keep the backslashes in the Windows paths from being
# treated as escape sequences.
with open(r'C:\Users\SVC\Desktop\test.json') as json_file:
    portfolio_group = json.load(json_file)

with open(r'C:\Users\SVC\Desktop\test.csv', 'w') as csv_file:
    csv_obj = csv.writer(csv_file)
    for data in portfolio_group['P_Portfolio Group']['data']:
        csv_obj.writerow([d['value'] for d in data['dataValues']])
This results in the following C:\Users\SVC\Desktop\test.csv content:
Client1,2068.73,14.0584
Client2,1511.9,10.2744
Client3,1354.74,9.2064
Client4,1225.78,8.33
Or use the pandas library; note that a JSON input needs read_json rather than read_csv:

import pandas as pd

data = pd.read_json(r'C:\Users\SVC\Desktop\test.json')
data.to_csv('test.csv')

Done (although you may still need to reshape the nested data afterwards).
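Given how deeply nested this particular JSON is, a hedged pandas sketch that reproduces the desired rows (paths and column labels taken from the question's sample) might look like:

import json
import pandas as pd

with open(r'C:\Users\SVC\Desktop\test.json') as f:
    portfolio_group = json.load(f)

# Flatten each data entry's dataValues into one row: name, amount, percentage.
rows = [[d['value'] for d in entry['dataValues']]
        for entry in portfolio_group['P_Portfolio Group']['data']]

df = pd.DataFrame(rows, columns=['Security name', 'MioCHF', '%'])
df.to_csv(r'C:\Users\SVC\Desktop\test.csv', index=False, header=False)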
