I'm using Kafka kafka_2.11-0.11.0.2 and Confluent version 3.3.0 for the Schema Registry.
I have defined an avro schema as follows:
{
"namespace": "com.myntra.search",
"type": "record",
"name": "SearchDataIngestionObject",
"fields": [
{"name": "timestamp","type":"long"},
{"name": "brandList", "type":{ "type" : "array", "items" : "string" }},
{"name": "articleTypeList", "type":{ "type" : "array", "items" : "string" }},
{"name": "gender", "type":{ "type" : "array", "items" : "string" }},
{"name": "masterCategoryList", "type":{ "type" : "array", "items" : "string" }},
{"name": "subCategoryList", "type":{ "type" : "array", "items" : "string" }},
{"name": "quAlgo","type":{ "type" : "array", "items" : "string" }},
{"name": "colours", "type":{ "type" : "array", "items" : "string" }},
{"name": "isLandingPage", "type": "boolean"},
{"name": "isUserQuery", "type": "boolean"},
{"name": "isAutoSuggest", "type": "boolean"},
{"name": "userQuery", "type": "string"},
{"name": "correctedQuery", "type": "string"},
{"name": "completeSolrQuery", "type": "string"},
{"name": "atsaList", "type":{"type": "map", "values":{ "type" : "array", "items" : "string" }}},
{"name": "quMeta", "type": {"type": "map", "values": "string"}},
{"name": "requestId", "type": "string"}
]
}
And I'm trying to write some data to Kafka as follows:
value = {
    "timestamp": 1597399323000,
    "brandList": ["brand_value"],
    "articleTypeList": ["articleType_value"],
    "gender": ["gender_value"],
    "masterCategoryList": ["masterCategory_value"],
    "subCategoryList": ["subCategory_value"],
    "quAlgo": ["quAlgo_value"],
    "colours": ["colours_value"],
    "isLandingPage": False,
    "isUserQuery": False,
    "isAutoSuggest": False,
    "userQuery": "userQuery_value",
    "correctedQuery": "correctedQuery_value",
    "completeSolrQuery": "completeSolrQuery_value",
    "atsaList": {
        "atsa_key1": ["atsa_value1"],
        "atsa_key2": ["atsa_value2"],
        "atsa_key3": ["atsa_value3"]
    },
    "quMeta": {
        "quMeta_key1": "quMeta_value1",
        "quMeta_key2": "quMeta_value2",
        "quMeta_key3": "quMeta_value3"
    },
    "requestId": "requestId_value"
}

topic = "search"
key = str(uuid.uuid4())
producer.produce(topic=topic, key=key, value=value)
producer.flush()
But I'm getting the following error:
Traceback (most recent call last):
File "producer.py", line 61, in <module>
producer.produce(topic=topic, key=key, value=value)
File "/Library/Python/2.7/site-packages/confluent_kafka/avro/__init__.py", line 99, in produce
value = self._serializer.encode_record_with_schema(topic, value_schema, value)
File "/Library/Python/2.7/site-packages/confluent_kafka/avro/serializer/message_serializer.py", line 118, in encode_record_with_schema
return self.encode_record_with_schema_id(schema_id, record, is_key=is_key)
File "/Library/Python/2.7/site-packages/confluent_kafka/avro/serializer/message_serializer.py", line 152, in encode_record_with_schema_id
writer(record, outf)
File "/Library/Python/2.7/site-packages/confluent_kafka/avro/serializer/message_serializer.py", line 86, in <lambda>
return lambda record, fp: writer.write(record, avro.io.BinaryEncoder(fp))
File "/Library/Python/2.7/site-packages/avro/io.py", line 979, in write
raise AvroTypeException(self.writers_schema, datum)
avro.io.AvroTypeException: The datum {'quAlgo': ['quAlgo_value'], 'userQuery': 'userQuery_value', 'isAutoSuggest': False, 'isLandingPage': False, 'timestamp': 1597399323000, 'articleTypeList': ['articleType_value'], 'colours': ['colours_value'], 'correctedQuery': 'correctedQuery_value', 'quMeta': {'quMeta_key1': 'quMeta_value1', 'quMeta_key2': 'quMeta_value2', 'quMeta_key3': 'quMeta_value3'}, 'requestId': 'requestId_value', 'gender': ['gender_value'], 'isUserQuery': False, 'brandList': ['brand_value'], 'masterCategoryList': ['masterCategory_value'], 'subCategoryList': ['subCategory_value'], 'completeSolrQuery': 'completeSolrQuery_value', 'atsaList': {'atsa_key1': ['atsa_value1'], 'atsa_key2': ['atsa_value2'], 'atsa_key3': ['atsa_value3']}} is not an example of the schema {
"namespace": "com.myntra.search",
"type": "record",
"name": "SearchDataIngestionObject",
"fields": [
{
"type": "long",
"name": "timestamp"
},
{
"type": {
"items": "string",
"type": "array"
},
"name": "brandList"
},
{
"type": {
"items": "string",
"type": "array"
},
"name": "articleTypeList"
},
{
"type": {
"items": "string",
"type": "array"
},
"name": "gender"
},
{
"type": {
"items": "string",
"type": "array"
},
"name": "masterCategoryList"
},
{
"type": {
"items": "string",
"type": "array"
},
"name": "subCategoryList"
},
{
"type": {
"items": "string",
"type": "array"
},
"name": "quAlgo"
},
{
"type": {
"items": "string",
"type": "array"
},
"name": "colours"
},
{
"type": "boolean",
"name": "isLandingPage"
},
{
"type": "boolean",
"name": "isUserQuery"
},
{
"type": "boolean",
"name": "isAutoSuggest"
},
{
"type": "string",
"name": "userQuery"
},
{
"type": "string",
"name": "correctedQuery"
},
{
"type": "string",
"name": "completeSolrQuery"
},
{
"type": {
"values": {
"items": "string",
"type": "array"
},
"type": "map"
},
"name": "atsaList"
},
{
"type": {
"values": "string",
"type": "map"
},
"name": "quMeta"
},
{
"type": "string",
"name": "requestId"
}
]
}
I even tried the same example as given here, but it doesn't work and throws the same error.
In your exception, the error says that the data you are providing is the following:
{'userQuery': 'userQuery_value',
'isAutoSuggest': False,
'isLandingPage': False,
'correctedQuery': 'correctedQuery_value',
'isUserQuery': False,
'timestamp': 1597399323000,
'completeSolrQuery': 'completeSolrQuery_value',
'requestId': 'requestId_value'}
This is much less than what you claim you are providing in your example.
Can you go back to your original code and, on line 60, just before you do producer.produce(topic=topic, key=key, value=value), do a simple print(value) to make sure you are sending the right value and that it hasn't been overwritten by some other line of code.
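For illustration, a minimal sketch of that check, reusing the variables from the question right before the produce call:
# Debug check suggested above: confirm the dict still has every field the schema requires.
print(value)
producer.produce(topic=topic, key=key, value=value)
producer.flush()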
A bit new to Avro & Python.
I am trying to do a simple conversion to Avro using the fastavro library, as the speed of the native Apache Avro library is just a bit too slow.
I want to:
1. Take a JSON file
2. Convert the data to Avro.
My problem is that it seems like my JSON isn't in the correct 'record' format to be converted to Avro. I even tried putting my JSON into a string var and making it look similar to the syntax they have on the site at https://fastavro.readthedocs.io/en/latest/writer.html:
{u'station': u'011990-99999', u'temp': 22, u'time': 1433270389},
{u'station': u'011990-99999', u'temp': -11, u'time': 1433273379},
{u'station': u'012650-99999', u'temp': 111, u'time': 1433275478},
Here is my code:
from fastavro import json_writer, parse_schema, writer
import json

key = "test.json"
schemaFileName = "test_schema.avsc"

with open(r'C:/Path/to/file/' + schemaFileName) as sc:
    w = json.load(sc)
schema = parse_schema(w)

with open(r'C:/Path/to/file/' + key) as js:
    x = json.load(js)

with open('C:/Path/to/file/output.avro', 'wb') as out:
    writer(out, schema, x, codec='deflate')
Here is what I get as output:
File "avropython.py", line 26, in <module>
writer(out, schema,x, codec='deflate')
File "fastavro\_write.pyx", line 608, in fastavro._write.writer
ValueError: "records" argument should be an iterable, not dict
My JSON file and schema, respectively:
"joined": false,
"toward": {
"selection": "dress",
"near": true,
"shoulder": false,
"fine": -109780201.3804388,
"pet": {
"stood": "saddle",
"live": false,
"leather": false,
"tube": false,
"over": false,
"impossible": true
},
"higher": false
},
"wear": true,
"asleep": "door",
"connected": true,
"stairs": -1195512399.5000324
}
{
"name": "MyClass",
"type": "record",
"namespace": "com.acme.avro",
"fields": [
{
"name": "joined",
"type": "boolean"
},
{
"name": "toward",
"type": {
"name": "toward",
"type": "record",
"fields": [
{
"name": "selection",
"type": "string"
},
{
"name": "near",
"type": "boolean"
},
{
"name": "shoulder",
"type": "boolean"
},
{
"name": "fine",
"type": "float"
},
{
"name": "pet",
"type": {
"name": "pet",
"type": "record",
"fields": [
{
"name": "stood",
"type": "string"
},
{
"name": "live",
"type": "boolean"
},
{
"name": "leather",
"type": "boolean"
},
{
"name": "tube",
"type": "boolean"
},
{
"name": "over",
"type": "boolean"
},
{
"name": "impossible",
"type": "boolean"
}
]
}
},
{
"name": "higher",
"type": "boolean"
}
]
}
},
{
"name": "wear",
"type": "boolean"
},
{
"name": "asleep",
"type": "string"
},
{
"name": "connected",
"type": "boolean"
},
{
"name": "stairs",
"type": "float"
}
]
}
If anyone could help me out, it would be greatly appreciated!!
As mentioned in the error ValueError: "records" argument should be an iterable, not dict, the problem is that when you call writer, the argument for the records needs to be an iterable. One way to solve this is to change your last line to writer(out, schema, [x], codec='deflate')
Alternatively, there is a schemaless_writer that can be used to just write a single record: https://fastavro.readthedocs.io/en/latest/writer.html#fastavro._write_py.schemaless_writer
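For reference, a minimal sketch of that schemaless_writer alternative, reusing the schema and x loaded in the question (a schemaless file has no container header, so the reader must already know the schema):
from fastavro import schemaless_writer

# schema and x are assumed to be loaded exactly as in the question's code
with open('C:/Path/to/file/output_schemaless.avro', 'wb') as out:
    schemaless_writer(out, schema, x)  # writes a single record, no Avro container header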
I'm unable to produce data for a specific schema and I'm unable to understand why. The example data included as a dictionary in the code was created directly with the Confluent "avro-random-generator", so the example data should be correct, since it's derived directly from the schema. Both the Schema Registry and the Avro Random Generator are Confluent tools, so it can't be that their tools produce example data that does not work with their own Schema Registry.
This is the Schema:
{
"type": "record",
"name": "schemaV1",
"namespace": "net.avro.schemaV1",
"doc": "",
"fields": [
{
"name": "orderId",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": ""
},
{
"name": "offerId",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": ""
},
{
"name": "redeemerId",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"doc": "",
"default": null
},
{
"name": "eventCancellationType",
"type": "int",
"doc": ""
},
{
"name": "ruleIds",
"type": {
"type": "array",
"items": {
"type": "string",
"avro.java.string": "String"
},
"doc": ""
}
},
{
"name": "eventOriginator",
"type": {
"type": "record",
"name": "AvroEventPartnerV1",
"doc": "",
"fields": [
{
"name": "partnerShortName",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": ""
},
{
"name": "businessUnitShortName",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"doc": "",
"default": null
},
{
"name": "branchShortName",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"doc": "",
"default": null
}
]
}
},
{
"name": "roundedDelta",
"doc": "",
"type": {
"type": "record",
"name": "AvroAmountV1",
"doc": "Amount with a currency",
"fields": [
{
"name": "amount",
"type": {
"type": "bytes",
"logicalType": "decimal",
"precision": 21,
"scale": 3
},
"doc": "The amount as a decimal number"
},
{
"name": "currency",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": ""
}
]
}
},
{
"name": "rewardableLegalDelta",
"type": [
"null",
"AvroAmountV1"
],
"doc": "",
"default": null
},
{
"name": "receiptNumber",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": ""
},
{
"name": "referenceReceiptNumber",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"doc": "",
"default": null
},
{
"name": "eventEffectiveTime",
"type": {
"type": "long"
},
"doc": ""
}
]
}
This is my script:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer, ClientError, ValueSerializerError
BOOTSTRAP_SERVER = 'localhost:9092'
SCHEMA_REGISTRY = 'http://localhost:8081'
TOPIC = 'topicV1'
SCHEMA_PATH = 'schemas/schemaV1.avsc'
def schemaReader(SCHEMA_PATH):
    with open(SCHEMA_PATH, 'r') as file:
        data = file.read()
    return data


def main():
    kafka_config = {
        'bootstrap.servers': BOOTSTRAP_SERVER,
        'schema.registry.url': SCHEMA_REGISTRY
    }
    value_schema = avro.loads(schemaReader(SCHEMA_PATH))
    null = None
    value = {
        "orderId": "a9bcc55f-e2c0-43d6-b793-ff5f295d051d",
        "offerId": "119475017578242889",
        "redeemerId": "1176a01b-b2dc-45a9-91cc-232361e14f99",
        "eventCancellationType": 0,
        "ruleIds": ["ID-IPM00001"],
        "eventOriginator": {"partnerShortName": "partner",
                            "businessUnitShortName": null,
                            "branchShortName": null},
        "roundedDelta": {"amount": "\u8463", "currency": "PTS"},
        "rewardableLegalDelta": {"amount": "\u8463", "currency": "EUR"},
        "receiptNumber": "19b2ff68-ed06-48f0-9ce9-d697c0eadc19",
        "referenceReceiptNumber": null,
        "eventEffectiveTime": 1569494696656
    }
    avroProducer = AvroProducer(kafka_config, default_value_schema=value_schema)
    avroProducer.produce(topic=TOPIC, value=value, value_schema=value_schema)
    avroProducer.flush()


if __name__ == "__main__":
    main()
This is the traceback I'm receiving:
File "producer.py", line 64, in <module>
main()
File "producer.py", line 60, in main
avroProducer.produce(topic=TOPIC, value=value, value_schema=value_schema)
File "/apps/python/python2.7/lib/python2.7/site-packages/confluent_kafka/avro/__init__.py", line 80, in produce
value = self._serializer.encode_record_with_schema(topic, value_schema, value)
File "/apps/python/python2.7/lib/python2.7/site-packages/confluent_kafka/avro/serializer/message_serializer.py", line 115, in encode_record_with_schema
return self.encode_record_with_schema_id(schema_id, record, is_key=is_key)
File "/apps/python/python2.7/lib/python2.7/site-packages/confluent_kafka/avro/serializer/message_serializer.py", line 149, in encode_record_with_schema_id
writer(record, outf)
File "/apps/python/python2.7/lib/python2.7/site-packages/confluent_kafka/avro/serializer/message_serializer.py", line 86, in <lambda>
return lambda record, fp: writer.write(record, avro.io.BinaryEncoder(fp))
File "/apps/python/python2.7/lib/python2.7/site-packages/avro/io.py", line 1042, in write
raise AvroTypeException(self.writers_schema, datum)
avro.io.AvroTypeException: The datum {'..'} is not an example of the schema { ..}
It seems that the problem is that amount should be a bytes type, but you have a normal string of \u8463. The library you mentioned using to generate the random data creates a byte string using the Java default charset: https://github.com/confluentinc/avro-random-generator/blob/master/src/main/java/io/confluent/avro/random/generator/Generator.java#L373
However, perhaps that default isn't ISO-8859-1, which is what the Java implementation (the reference implementation) uses: https://github.com/apache/avro/blob/bf47ec97e0b7f5701042fac067b73b421a9177b7/lang/java/avro/src/main/java/org/apache/avro/io/JsonEncoder.java#L220
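For illustration only, a Python 3 sketch of building a bytes value for that decimal amount (scale 3, per the schema); the helper name and the reuse of the question's value dict are assumptions, not part of either library:
from decimal import Decimal

def decimal_to_avro_bytes(value, scale=3):
    # Avro's decimal logical type stores the unscaled integer as big-endian,
    # two's-complement bytes; the scale (3 here) comes from the schema.
    unscaled = int(Decimal(value).scaleb(scale))
    length = max(1, (unscaled.bit_length() + 8) // 8)  # leave room for the sign bit
    return unscaled.to_bytes(length, byteorder='big', signed=True)

# Hypothetical replacement for the generated string amounts in the question:
value["roundedDelta"] = {"amount": decimal_to_avro_bytes("12.345"), "currency": "PTS"}
value["rewardableLegalDelta"] = {"amount": decimal_to_avro_bytes("9.990"), "currency": "EUR"}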
I have a JSON structure to which some content needs to be appended. I tried with sed and bash, but those only append at the end of a string or file, not at the end of the structure.
{
"$schema": "http://json-schema.org/draft-04/schema#",
"required": [
"accounts"
],
"accounts": {
"required": "account",
"properties": {
"account": {
"type": "array",
"minItems": 1,
"maxItems": 999,
"required": [
"scheme",
"accountType",
"accountSubType"
],
"items": {
"type": "object",
"properties": {
"scheme": {
"description": "scheme",
"type": "object",
"required": [
"schemeName",
"identification"
],
"properties": {
"schemeName": {
"type": "string",
"maxLength": 40
},
"identification": {
"type": "string",
"maxLength": 256
},
"name": {
"type": "string",
"maxLength": 70
},
"secondaryIdentification": {
"type": "string",
"maxLength": 35
}
}
},
"currency": {
"type": "string",
"format": "iso-4217",
"pattern": "^[A-Z]{3,3}$",
"maxLength": 3,
"example": "EUR"
},
"accountType": {
"type": "string"
},
"accountSubType": {
"type": "string",
"maxLength": 35
}
}
}
}
}
}
}
I would like to update the above as follows:
{
"$schema": "http://json-schema.org/draft-04/schema#",
"required": [
"accounts"
],
"accounts": {
"required": "account",
"properties": {
"account": {
"type": "array",
"minItems": 1,
"maxItems": 999,
"required": [
"scheme",
"accountType",
"accountSubType"
],
"items": {
"type": "object",
"properties": {
"scheme": {
"description": "scheme",
"type": "object",
"required": [
"schemeName",
"identification"
],
"properties": {
"schemeName": {
"type": "string",
"maxLength": 40
},
"identification": {
"type": "string",
"maxLength": 256
},
"name": {
"type": "string",
"maxLength": 70
},
"secondaryIdentification": {
"type": "string",
"maxLength": 35
}
},
"additionalProperties": false
},
"currency": {
"type": "string",
"format": "iso-4217",
"pattern": "^[A-Z]{3,3}$",
"maxLength": 3,
"example": "EUR"
},
"accountType": {
"type": "string"
},
"accountSubType": {
"type": "string",
"maxLength": 35
}
},
"additionalProperties": false
}
}
},
"additionalProperties": false
}
}
The difference is at the end of every "properties" section: I have appended "additionalProperties": false after it.
Is there a way to do this through a script, so that it checks every "properties" section and appends that?
You can do this with jq (Requires jq 1.6 because it uses the walk() function to traverse the entire structure):
$ jq 'walk(if type == "object" and has("properties") then . + { additionalProperties: false } else . end)' your.json
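If jq 1.6 isn't available, a rough Python sketch of the same walk (assuming the document is stored in your.json; the function name is made up for illustration):
import json

def add_additional_properties(node):
    # Visit every object in the document; whenever one has a "properties" key,
    # also set "additionalProperties": false on it, mirroring jq's walk().
    if isinstance(node, dict):
        if "properties" in node:
            node["additionalProperties"] = False
        for child in node.values():
            add_additional_properties(child)
    elif isinstance(node, list):
        for item in node:
            add_additional_properties(item)

with open("your.json") as f:
    doc = json.load(f)
add_additional_properties(doc)
print(json.dumps(doc, indent=2))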
Does it matter if "additionalProperties" comes after or before "properties"?
If not, you could use sed to add "additionalProperties" before the object "properties" like this:
sed -E 's/([[:space:]]*)"properties": {/\1"additionalProperties": false,|\1"properties": {/g'| tr '|' '\n'
With your input you will get:
{
"$schema": "http://json-schema.org/draft-04/schema#",
"required": [
"accounts"
],
"accounts": {
"required": "account",
"additionalProperties": false,
"properties": {
"account": {
"type": "array",
"minItems": 1,
"maxItems": 999,
"required": [
"scheme",
"accountType",
"accountSubType"
],
"items": {
"type": "object",
"additionalProperties": false,
"properties": {
"scheme": {
"description": "scheme",
"type": "object",
"required": [
"schemeName",
"identification"
],
"additionalProperties": false,
"properties": {
"schemeName": {
"type": "string",
"maxLength": 40
},
"identification": {
"type": "string",
"maxLength": 256
},
"name": {
"type": "string",
"maxLength": 70
},
"secondaryIdentification": {
"type": "string",
"maxLength": 35
}
}
},
"currency": {
"type": "string",
"format": "iso-4217",
"pattern": "^[A-Z]{3,3}$",
"maxLength": 3,
"example": "EUR"
},
"accountType": {
"type": "string"
},
"accountSubType": {
"type": "string",
"maxLength": 35
}
}
}
}
}
}
}
I have a large JSON file, about 5 million records and a file size of about 32GB, that I need to get loaded into our Snowflake Data Warehouse. I need to get this file broken up into chunks of about 200k records (about 1.25GB) per file. I'd like to do this in either Node.JS or Python for deployment to an AWS Lambda function, unfortunately I haven't coded in either, yet. I have C# and a lot of SQL experience, and learning both node and python are on my to do list, so why not dive right in, right!?
My first question is "Which language would better serve this function? Python, or Node.JS?"
I know I don't want to read this entire JSON file into memory (or even the output smaller file). I need to be able to "stream" it in and out into the new file based on a record count (200k), properly close up the json objects, and continue into a new file for another 200k, and so on. I know Node can do this, but if Python can also do this, I feel like it would be easier to quickly start using for other ETL stuff I'll be doing soon.
My second question is: "Based on your recommendation above, can you also recommend what modules I should require/import to help me get started? Primarily as it relates to not pulling the entire JSON file into memory. Maybe some tips, tricks, or 'how would you do it' advice? And if you're feeling really generous, some code example to help push me into the deep end on this?"
I can't include a sample of the JSON data, as it contains personal information. But I can provide the JSON schema ...
{
"$schema": "http://json-schema.org/draft-04/schema#",
"items": {
"properties": {
"activities": {
"properties": {
"activity_id": {
"items": {
"type": "integer"
},
"type": "array"
},
"frontlineorg_id": {
"items": {
"type": "integer"
},
"type": "array"
},
"import_id": {
"items": {
"type": "integer"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"is_source": {
"items": {
"type": "boolean"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"address": {
"properties": {
"city": {
"items": {
"type": "string"
},
"type": "array"
},
"congress_dist_name": {
"items": {
"type": "string"
},
"type": "array"
},
"congress_dist_number": {
"items": {
"type": "integer"
},
"type": "array"
},
"congress_end_yr": {
"items": {
"type": "integer"
},
"type": "array"
},
"congress_number": {
"items": {
"type": "integer"
},
"type": "array"
},
"congress_start_yr": {
"items": {
"type": "integer"
},
"type": "array"
},
"county": {
"items": {
"type": "string"
},
"type": "array"
},
"formatted": {
"items": {
"type": "string"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"latitude": {
"items": {
"type": "number"
},
"type": "array"
},
"longitude": {
"items": {
"type": "number"
},
"type": "array"
},
"number": {
"items": {
"type": "string"
},
"type": "array"
},
"observes_dst": {
"items": {
"type": "boolean"
},
"type": "array"
},
"post_directional": {
"items": {
"type": "null"
},
"type": "array"
},
"pre_directional": {
"items": {
"type": "null"
},
"type": "array"
},
"school_district": {
"items": {
"properties": {
"school_dist_name": {
"items": {
"type": "string"
},
"type": "array"
},
"school_dist_type": {
"items": {
"type": "string"
},
"type": "array"
},
"school_grade_high": {
"items": {
"type": "string"
},
"type": "array"
},
"school_grade_low": {
"items": {
"type": "string"
},
"type": "array"
},
"school_lea_code": {
"items": {
"type": "integer"
},
"type": "array"
}
},
"type": "object"
},
"type": "array"
},
"secondary_number": {
"items": {
"type": "null"
},
"type": "array"
},
"secondary_unit": {
"items": {
"type": "null"
},
"type": "array"
},
"state": {
"items": {
"type": "string"
},
"type": "array"
},
"state_house_dist_name": {
"items": {
"type": "string"
},
"type": "array"
},
"state_house_dist_number": {
"items": {
"type": "integer"
},
"type": "array"
},
"state_senate_dist_name": {
"items": {
"type": "string"
},
"type": "array"
},
"state_senate_dist_number": {
"items": {
"type": "integer"
},
"type": "array"
},
"street": {
"items": {
"type": "string"
},
"type": "array"
},
"suffix": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"timezone": {
"items": {
"type": "string"
},
"type": "array"
},
"utc_offset": {
"items": {
"type": "integer"
},
"type": "array"
},
"zip": {
"items": {
"type": "integer"
},
"type": "array"
}
},
"type": "object"
},
"age": {
"type": "integer"
},
"anniversary": {
"properties": {
"date": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"baptism": {
"properties": {
"church_id": {
"type": "null"
},
"date": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"birth_dd": {
"type": "integer"
},
"birth_mm": {
"type": "integer"
},
"birth_yyyy": {
"type": "integer"
},
"church_attendance": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"likelihood": {
"items": {
"type": "integer"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"cohabiting": {
"properties": {
"confidence": {
"items": {
"type": "string"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"likelihood": {
"items": {
"type": "null"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"dating": {
"properties": {
"bool": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"divorced": {
"properties": {
"bool": {
"items": {
"type": "null"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"likelihood_considering": {
"items": {
"type": "integer"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"education": {
"properties": {
"est_level": {
"items": {
"type": "string"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"email": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"is_work_school": {
"items": {
"type": "boolean"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"engaged": {
"properties": {
"insert_datetime_utc": {
"type": "null"
},
"likelihood": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"est_income": {
"properties": {
"est_level": {
"items": {
"type": "string"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"ethnicity": {
"type": "string"
},
"first_name": {
"type": "string"
},
"formatted_birthdate": {
"type": "string"
},
"gender": {
"type": "string"
},
"head_of_household": {
"properties": {
"bool": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"home_church": {
"properties": {
"church_id": {
"type": "null"
},
"group_participant": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"is_coaching": {
"type": "null"
},
"is_giving": {
"type": "null"
},
"is_serving": {
"type": "null"
},
"membership_date": {
"type": "null"
},
"regular_attendee": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"hub_poid": {
"type": "integer"
},
"insert_datetime_utc": {
"type": "string"
},
"ip_address": {
"properties": {
"insert_datetime_utc": {
"type": "null"
},
"string": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"last_name": {
"type": "string"
},
"marriage_segment": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"married": {
"properties": {
"bool": {
"items": {
"type": "boolean"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"middle_name": {
"type": "string"
},
"miscellaneous": {
"properties": {
"attribute": {
"items": {
"type": "string"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"value": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"name_suffix": {
"type": "null"
},
"name_title": {
"type": "null"
},
"newlywed": {
"properties": {
"bool": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"parent": {
"properties": {
"bool": {
"items": {
"type": "boolean"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"likelihood_expecting": {
"items": {
"type": "integer"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"person_id": {
"type": "integer"
},
"phone": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"number": {
"items": {
"type": "integer"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"type": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"property_rights": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"psychographic_cluster": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"religion": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"religious_segment": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"separated": {
"properties": {
"bool": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"significant_other": {
"properties": {
"first_name": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"last_name": {
"type": "null"
},
"middle_name": {
"type": "null"
},
"name_suffix": {
"type": "null"
},
"name_title": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"suppressed_datetime_utc": {
"type": "string"
},
"target_group": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
}
},
"type": "object"
},
"type": "array"
}
Use these commands at the Linux command prompt:
split -b 53750k <your-file>
cat xa* > <your-file>
Refer to this link:
https://askubuntu.com/questions/28847/text-editor-to-edit-large-4-3-gb-plain-text-file
Answering the question whether Python or Node will be better for the task would be an opinion, and we are not allowed to voice our opinions on Stack Overflow. You have to decide for yourself what you have more experience in and what you want to work with - Python or Node.
If you go with Node, there are some modules that can help you with that task, that do streaming JSON parsing. E.g. those modules:
https://www.npmjs.com/package/JSONStream
https://www.npmjs.com/package/stream-json
https://www.npmjs.com/package/json-stream
If you go with Python, there are streaming JSON parsers here as well:
https://github.com/kashifrazzaqui/json-streamer
https://github.com/danielyule/naya
http://www.enricozini.org/blog/2011/tips/python-stream-json/
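Not listed above, but as a concrete sketch of the streaming approach in Python, the ijson package can iterate the top-level array without loading the whole file into memory; the chunk size and file names below are placeholders:
import json
import ijson  # pip install ijson

CHUNK_SIZE = 200000  # records per output file, per the question

with open("bigfile.json", "rb") as source:
    chunk, part = [], 0
    for record in ijson.items(source, "item"):  # yields one top-level array element at a time
        chunk.append(record)
        if len(chunk) == CHUNK_SIZE:
            with open("part_{:04d}.json".format(part), "w") as out:
                # ijson can yield numbers as decimal.Decimal, so coerce them on write
                json.dump(chunk, out, default=float)
            chunk, part = [], part + 1
    if chunk:  # write the final, smaller chunk
        with open("part_{:04d}.json".format(part), "w") as out:
            json.dump(chunk, out, default=float)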
Consider using jq to preprocess your JSON files; it can split and stream your large JSON files.
jq is like sed for JSON data - you can use it to slice and filter and map and transform structured data with the same ease that sed, awk, grep and friends let you play with text.
See the official documentation and this question for more.
Extra: as for your first question, jq is written in C, so isn't it faster than Python/Node?
Snowflake has very special treatment for JSON, and if we understand it, it is easy to draw up the design.
JSON/Parquet/Avro/XML are considered semi-structured data.
They are stored as the VARIANT data type in Snowflake.
While loading the JSON data from the stage, set strip_outer_array = true:
copy into <table>
from @~/<file>.json
file_format = (type = 'JSON' strip_outer_array = true);
Each row cannot exceed 16 MB compressed when loaded into Snowflake.
Snowflake data loading works well if the files are split into the 10-100 MB range.
Use utilities that can split the file on line boundaries, keeping each file to no more than 100 MB; that brings the power of parallelism as well as accuracy for your data.
As per your data set size, you will get around 31K small files (of 100 MB each).
That would mean 31k parallel processes running, which is not possible.
So choose an X-Large warehouse (16 v-cores & 32 threads):
31k/32 = (approximately) 1000 rounds
Loading the data should not take long, depending on your network bandwidth; even at 3 seconds per round, it would load the data in about 50 minutes.
Look at the warehouse configuration & throughput details and refer to the semi-structured data loading best practices.
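For completeness, a hedged sketch of issuing that COPY from Python with the snowflake-connector-python package; every connection value and object name below is a placeholder, not part of the original answer:
import snowflake.connector  # pip install snowflake-connector-python

# All connection values below are placeholders.
conn = snowflake.connector.connect(
    account="my_account", user="my_user", password="my_password",
    warehouse="LOAD_WH", database="MY_DB", schema="PUBLIC",
)
try:
    cur = conn.cursor()
    cur.execute(
        "copy into my_table from @~/my_file.json "
        "file_format = (type = 'JSON' strip_outer_array = true)"
    )
finally:
    conn.close()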
The easiest approach that worked for me was this:
json_file = <your_file>  # the JSON data already loaded as a list of records
chunks = 200  # records per chunk
for i in range(0, len(json_file), chunks):
    print(json_file[i:i + chunks])
To split and compress at the same time with bash, resulting in files of ~100MB each:
cat bigfile.json | split -C 1000000000 -d -a4 - output_prefix --filter='gzip > $FILE.gz'
See more: https://stackoverflow.com/a/68718176/132438
You can use Python3 with the following script:
import json

def split_json(file_path):
    with open(file_path, 'r') as json_file:
        data = json.load(json_file)
    chunk_size = len(data) // 3
    for i in range(3):
        start = i * chunk_size
        end = (i + 1) * chunk_size if i < 2 else len(data)  # last part takes the remainder
        with open(f"part{i}.json", 'w') as outfile:
            outfile.write(json.dumps(data[start:end]))

file_path = input("Enter the file path of the JSON file: ")
split_json(file_path)
I've written a small chunk of JSON schema, but I'm getting a validation error using Python jsonschema.
Here is my schema:
{
"$schema": "http://json-schema.org/draft-04/schema#",
"definitions": {
"output": {
"type": "object",
"properties": {
"Type": {
"type": "object",
"properties": {
"Type": {
"type": "string"
},
"Value": {
"type": "string"
},
"Default": {
"type": "string"
},
"Description": {
"type": "string"
},
"Options": {
"type": "array"
}
},
"required": [
"Type",
"Value",
"Default",
"Description",
"Options"
]
},
"Inverted": {
"type": "object",
"properties": {
"Type": {
"type": "string"
},
"Value": {
"type": "bool"
},
"Default": {
"type": "bool"
},
"Description": {
"type": "string"
}
},
"required": [
"Type",
"Value",
"Default",
"Description"
]
},
"Pulse Width": {
"type": "object",
"properties": {
"Type": {
"type": "string"
},
"Value": {
"type": "number"
},
"Default": {
"type": "number"
},
"Description": {
"type": "string"
}
},
"required": [
"Type",
"Value",
"Default",
"Description"
]
}
},
"required": [
"Type",
"Inverted",
"Pulse Width"
]
}
}
}
Here is the error I'm receiving:
Failed validating u'type' in schema
I'm attempting to validate my schema with:
schema = ""
with open(jsonSchemaFilePath, 'r') as schema_file:
schema = schema_file.read()
try:
Draft4Validator.check_schema(schema)
except SchemaError as schemaError:
print schemaError
What am I doing wrong with the schema I've written? Am I not allowed to have a property named Type?
My problem was that Draft4Validator.check_schema takes a dict, not a string containing JSON.
Here was my solution:
schema = {}
with open(jsonSchemaFilePath, 'r') as schema_file:
    schema = json.loads(schema_file.read())

try:
    Draft4Validator.check_schema(schema)
except SchemaError as schemaError:
    print schemaError