I have a JavaScript AST in the form of JSON (a Python dictionary).
I need to extract only the IfStatement nodes (the entire if-condition block) from the AST using Python, so that I can tokenize the extracted data and use it for some deep learning tasks.
{
"type": "Program",
"body": [
{
"type": "ExpressionStatement",
"expression": {
"type": "AssignmentExpression",
"operator": "=",
"left": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "MemberExpression",
"computed": true,
"object": {
"type": "Identifier",
"name": "Template"
},
"property": {
"type": "CallExpression",
"callee": {
"type": "Identifier",
"name": "getTemplate"
},
"arguments": [
{
"type": "Literal",
"value": "layout",
"raw": "'layout'"
}
]
}
},
"property": {
"type": "Identifier",
"name": "rendered"
}
},
"right": {
"type": "FunctionExpression",
"id": null,
"params": [],
"body": {
"type": "BlockStatement",
"body": [
{
"type": "IfStatement",
"test": {
"type": "AssignmentExpression",
"operator": "=",
"left": {
"type": "Identifier",
"name": "currentScroll"
},
"right": {
"type": "CallExpression",
"callee": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "Identifier",
"name": "Session"
},
"property": {
"type": "Identifier",
"name": "get"
}
},
"arguments": [
{
"type": "Literal",
"value": "currentScroll",
"raw": "'currentScroll'"
}
]
}
},
"consequent": {
"type": "BlockStatement",
"body": [
{
"type": "ExpressionStatement",
"expression": {
"type": "CallExpression",
"callee": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "CallExpression",
"callee": {
"type": "Identifier",
"name": "$"
},
"arguments": [
{
"type": "Literal",
"value": "body",
"raw": "'body'"
}
]
},
"property": {
"type": "Identifier",
"name": "scrollTop"
}
},
"arguments": [
{
"type": "Identifier",
"name": "currentScroll"
}
]
}
},
{
"type": "ExpressionStatement",
"expression": {
"type": "CallExpression",
"callee": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "Identifier",
"name": "Session"
},
"property": {
"type": "Identifier",
"name": "set"
}
},
"arguments": [
{
"type": "Literal",
"value": "currentScroll",
"raw": "'currentScroll'"
},
{
"type": "Literal",
"value": null,
"raw": "null"
}
]
}
}
]
},
"alternate": null
},
{
"type": "VariableDeclaration",
"declarations": [
{
"type": "VariableDeclarator",
"id": {
"type": "Identifier",
"name": "link"
},
"init": {
"type": "CallExpression",
"callee": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "Identifier",
"name": "document"
},
"property": {
"type": "Identifier",
"name": "createElement"
}
},
"arguments": [
{
"type": "Literal",
"value": "link",
"raw": "'link'"
}
]
}
}
],
"kind": "var"
},
{
"type": "ExpressionStatement",
"expression": {
"type": "AssignmentExpression",
"operator": "=",
"left": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "Identifier",
"name": "link"
},
"property": {
"type": "Identifier",
"name": "type"
}
},
"right": {
"type": "Literal",
"value": "image/x-icon",
"raw": "'image/x-icon'"
}
}
},
{
"type": "ExpressionStatement",
"expression": {
"type": "AssignmentExpression",
"operator": "=",
"left": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "Identifier",
"name": "link"
},
"property": {
"type": "Identifier",
"name": "rel"
}
},
"right": {
"type": "Literal",
"value": "shortcut icon",
"raw": "'shortcut icon'"
}
}
},
{
"type": "ExpressionStatement",
"expression": {
"type": "AssignmentExpression",
"operator": "=",
"left": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "Identifier",
"name": "link"
},
"property": {
"type": "Identifier",
"name": "href"
}
},
"right": {
"type": "CallExpression",
"callee": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "Identifier",
"name": "Settings"
},
"property": {
"type": "Identifier",
"name": "get"
}
},
"arguments": [
{
"type": "Literal",
"value": "faviconUrl",
"raw": "'faviconUrl'"
},
{
"type": "Literal",
"value": "/img/favicon.ico",
"raw": "'/img/favicon.ico'"
}
]
}
}
},
{
"type": "ExpressionStatement",
"expression": {
"type": "CallExpression",
"callee": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "MemberExpression",
"computed": true,
"object": {
"type": "CallExpression",
"callee": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "Identifier",
"name": "document"
},
"property": {
"type": "Identifier",
"name": "getElementsByTagName"
}
},
"arguments": [
{
"type": "Literal",
"value": "head",
"raw": "'head'"
}
]
},
"property": {
"type": "Literal",
"value": 0,
"raw": "0"
}
},
"property": {
"type": "Identifier",
"name": "appendChild"
}
},
"arguments": [
{
"type": "Identifier",
"name": "link"
}
]
}
},
{
"type": "ExpressionStatement",
"expression": {
"type": "CallExpression",
"callee": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "CallExpression",
"callee": {
"type": "Identifier",
"name": "$"
},
"arguments": [
{
"type": "Literal",
"value": "a.category-silent-hangout",
"raw": "'a.category-silent-hangout'"
}
]
},
"property": {
"type": "Identifier",
"name": "after"
}
},
"arguments": [
{
"type": "Literal",
"value": "<span class=\"silent-icons\"> <img src=\"http://codebuddies.org/images/icon-video-off.png\" alt=\"turn off video\" width=\"25\" height=\"25\"> <img src=\"http://codebuddies.org/images/icon-mute.png\" alt=\"turn off microphone\" width=\"25\" height=\"25\"></span>",
"raw": "'<span class=\"silent-icons\"> <img src=\"http://codebuddies.org/images/icon-video-off.png\" alt=\"turn off video\" width=\"25\" height=\"25\"> <img src=\"http://codebuddies.org/images/icon-mute.png\" alt=\"turn off microphone\" width=\"25\" height=\"25\"></span>'"
}
]
}
}
]
},
"generator": false,
"expression": false,
"async": false
}
}
},
{
"type": "ExpressionStatement",
"expression": {
"type": "CallExpression",
"callee": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "MemberExpression",
"computed": true,
"object": {
"type": "Identifier",
"name": "Template"
},
"property": {
"type": "CallExpression",
"callee": {
"type": "Identifier",
"name": "getTemplate"
},
"arguments": [
{
"type": "Literal",
"value": "layout",
"raw": "'layout'"
}
]
}
},
"property": {
"type": "Identifier",
"name": "events"
}
},
"arguments": [
{
"type": "ObjectExpression",
"properties": [
{
"type": "Property",
"key": {
"type": "Literal",
"value": "click .inner-wrapper",
"raw": "'click .inner-wrapper'"
},
"computed": false,
"value": {
"type": "FunctionExpression",
"id": null,
"params": [
{
"type": "Identifier",
"name": "e"
}
],
"body": {
"type": "BlockStatement",
"body": [
{
"type": "IfStatement",
"test": {
"type": "CallExpression",
"callee": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "CallExpression",
"callee": {
"type": "Identifier",
"name": "$"
},
"arguments": [
{
"type": "Literal",
"value": "body",
"raw": "'body'"
}
]
},
"property": {
"type": "Identifier",
"name": "hasClass"
}
},
"arguments": [
{
"type": "Literal",
"value": "mobile-nav-open",
"raw": "'mobile-nav-open'"
}
]
},
"consequent": {
"type": "BlockStatement",
"body": [
{
"type": "ExpressionStatement",
"expression": {
"type": "CallExpression",
"callee": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "Identifier",
"name": "e"
},
"property": {
"type": "Identifier",
"name": "preventDefault"
}
},
"arguments": []
}
},
{
"type": "ExpressionStatement",
"expression": {
"type": "CallExpression",
"callee": {
"type": "MemberExpression",
"computed": false,
"object": {
"type": "CallExpression",
"callee": {
"type": "Identifier",
"name": "$"
},
"arguments": [
{
"type": "Literal",
"value": "body",
"raw": "'body'"
}
]
},
"property": {
"type": "Identifier",
"name": "removeClass"
}
},
"arguments": [
{
"type": "Literal",
"value": "mobile-nav-open",
"raw": "'mobile-nav-open'"
}
]
}
}
]
},
"alternate": null
}
]
},
"generator": false,
"expression": false,
"async": false
},
"kind": "init",
"method": false,
"shorthand": false
}
]
}
]
}
}
],
"sourceType": "script"
}
I want the subtrees of the if statements mentioned below:
if(currentScroll=Session.get('currentScroll'))
if ($('body').hasClass('mobile-nav-open'))
Is there an easy way to extract this information in Python?
I am looking for methods or packages in Python to solve this problem, rather than traversing the entire dictionary myself.
You can use the tree-sitter library for this purpose.
Check out the Usage section in the README file to set up the package.
This is what you need to do at a high-level:
from tree_sitter import Language, Parser
JS_LANGUAGE = Language('build/my-languages.so', 'javascript')
parser = Parser()
parser.set_language(JS_LANGUAGE)
parser.parse(bytes(<code string>, 'utf-8'))
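Once the source is parsed you can walk the tree and collect every if statement. Here is a minimal sketch, assuming the setup above; the sample source string is a placeholder, and 'if_statement' is the node type used by tree-sitter's JavaScript grammar:
import-free continuation of the snippet above:
source = "if (x) { y(); } var z = 1;"   # placeholder: your real JavaScript code
source_bytes = bytes(source, 'utf-8')
tree = parser.parse(source_bytes)

def collect_if_statements(node, acc):
    # Recursively gather the source text of every if_statement node.
    if node.type == 'if_statement':
        acc.append(source_bytes[node.start_byte:node.end_byte].decode('utf-8'))
    for child in node.children:
        collect_if_statements(child, acc)

if_blocks = []
collect_if_statements(tree.root_node, if_blocks)
print(if_blocks)  # -> ['if (x) { y(); }']
If you only have the esprima-style JSON dictionary from the question rather than the original source, the same idea works directly on the dictionary: recursively walk the nested dicts and lists and collect every sub-dictionary whose "type" is "IfStatement".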
My issue is similar to the question BigQuery use_avro_logical_types ignored in Python script, but I have already updated the Google libraries I use, without success, so I would like to understand what is happening in my case. Below is the part of the script that ingests the Avro file. Note: the issue occurs in more than one Avro-to-BigQuery ingestion pipeline.
EDIT: The suggested solution of changing the schema type didn't work; it gave another error.
Schema in BQ:
Data in BQ:
import csv
import base64
import json
import io
import avro.schema
import avro.io
from avro.datafile import DataFileReader, DataFileWriter
import math
import os
import gcloud
from gcloud import storage
from google.cloud import bigquery
from oauth2client.client import GoogleCredentials
from datetime import datetime, timedelta, date
import numpy as np
try:
    script_path = os.path.dirname(os.path.abspath(__file__)) + "/"
except:
    script_path = "C:\\Users\\me\\key.json"

# BigQuery credentials and settings
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = script_path

folder = str((datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d'))
data_folder = str((datetime.now() - timedelta(days=1)).strftime('%Y%m%d'))
bucket_name = 'gs://bucket/*.csv'
dataset = 'dataset'
tabela = 'table_ids'
new_file = 'C:\\Users\\me\\register_' + data_folder + '.avro'
file_schema = 'C:\\Users\\me\\schema.avsc'
new_filename = 'register_' + data_folder + '.avro'

bq1 = bigquery.Client()

# Delete IDs
query1 = """DELETE FROM dataset.table_ids WHERE ID IS NOT NULL"""
query_job1 = bq1.query(query1)

def insert_bigquery(target_uri, dataset_id, table_id):
    bigquery_client = bigquery.Client()
    dataset_ref = bigquery_client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.schema = [
        bigquery.SchemaField('id', 'STRING', mode='REQUIRED')
    ]
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.field_delimiter = ";"
    uri = target_uri
    load_job = bigquery_client.load_table_from_uri(
        uri,
        dataset_ref.table(table_id),
        job_config=job_config
    )
    print('Starting job {}'.format(load_job.job_id))
    load_job.result()
    print('Job finished.')

insert_bigquery(bucket_name, dataset, tabela)

def get_data_from_bigquery():
    """query bigquery to get data to import to PSQL"""
    bq = bigquery.Client()
    # Fetch IDs
    query = """SELECT id FROM dataset.table_ids"""
    query_job = bq.query(query)
    data = query_job.result()
    rows = list(data)
    return rows

a = get_data_from_bigquery()
length = len(a)
line_count = 0

schema = avro.schema.Parse(open(file_schema, "rb").read())  # need to know the schema to write, according to Apache Avro 1.8.2
writer = DataFileWriter(open(new_file, "wb"), avro.io.DatumWriter(), schema)

for row in range(length):
    bytes = base64.b64decode(str(a[row][0]))
    bytes = bytes[5:]
    buf = io.BytesIO(bytes)
    decoder = avro.io.BinaryDecoder(buf)
    rec_reader = avro.io.DatumReader(avro.schema.Parse(open(file_schema).read()))
    out = rec_reader.read(decoder)
    writer.append(out)
writer.close()

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob("insert/" + destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print('File {} uploaded to {}'.format(
        source_file_name,
        destination_blob_name
    ))

upload_blob('bucket', new_file, new_filename)

def insert_bigquery_avro(target_uri, dataset_id, table_id):
    bigquery_client = bigquery.Client()
    dataset_ref = bigquery_client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.source_format = bigquery.SourceFormat.AVRO
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.use_avro_logical_types = True
    time_partitioning = bigquery.table.TimePartitioning()
    job_config.time_partitioning = time_partitioning
    uri = target_uri
    load_job = bigquery_client.load_table_from_uri(
        uri,
        dataset_ref.table(table_id),
        job_config=job_config
    )
    print('Starting job {}'.format(load_job.job_id))
    load_job.result()
    print('Job finished.')
Avro Schema:
"fields": [
{
"name": "id",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": "the payment id"
},
{
"name": "merchant",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": "the merchant who owns the payment"
},
{
"name": "date",
"type": {
"type": "long",
"logicalType": "timestamp-millis"
},
"doc": "the date where the transaction happend"
},
{
"name": "amount",
"type": {
"type": "record",
"name": "amount",
"fields": [
{
"name": "amount",
"type": [
"null",
{
"type": "bytes",
"logicalType": "decimal",
"precision": 5,
"scale": 5
}
],
"doc": "the original currency amount",
"default": null
},
{
"name": "foreignAmount",
"type": [
"null",
{
"type": "bytes",
"logicalType": "decimal",
"precision": 5,
"scale": 5
}
],
"doc": "the foreign amount for the payment",
"default": null
},
{
"name": "code",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": "the destination currency code"
}
],
"default": null
}
},
{
"name": "exchange_rate",
"type": {
"type": "record",
"name": "code",
"fields": [
{
"name": "currency_code",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"doc": "the exchange rate currency code",
"default": null
},
{
"name": "rate",
"type": [
"null",
{
"type": "bytes",
"logicalType": "decimal",
"precision": 5,
"scale": 5
}
],
"default": null
},
{
"name": "online_rate",
"type": [
"null",
{
"type": "bytes",
"logicalType": "decimal",
"precision": 5,
"scale": 5
}
],
"default": null
}
]
},
"doc": "The transaction exchange rate"
},
{
"name": "consumer",
"type": {
"type": "record",
"name": "Consumer",
"fields": [
{
"name": "name",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"doc": "the consumer's name",
"default": null
},
{
"name": "email",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"doc": "the consumer's email address",
"default": null
},
{
"name": "external_id",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"doc": "the consumer's external id when needed",
"default": null
},
{
"name": "national_id",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": "the national id"
},
{
"name": "phone",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": "the consumer's phone number",
"default": ""
}
]
}
},
{
"name": "soft_descriptor",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"doc": "the description as it will be shown at the customer's invoice",
"default": null
},
{
"name": "merchant_contract",
"type": {
"type": "enum",
"name": "merchant_contract_type",
"symbols": [
"PAY",
"BANK"
]
},
"default": "PAY"
},
{
"name": "type",
"type": {
"type": "enum",
"name": "payment_type",
"symbols": [
"INITIAL",
"CREDIT_CARD",
"DEBIT_CARD",
"ONLINE_DEBIT",
"BANK_SLIP",
"DIGITAL_WALLET",
"ELECTRONIC_BANK_TRANSFER"
]
},
"default": "INITIAL"
},
{
"name": "card",
"type": {
"type": "record",
"name": "card",
"fields": [
{
"name": "type",
"type": [
"null",
{
"type": "enum",
"name": "card_type",
"symbols": [
"CARD",
"TOKEN"
]
}
],
"default": null
},
{
"name": "mask_number",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"default": null
},
{
"name": "card_holder",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"default": null
},
{
"name": "brand",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"default": null
}
]
}
},
{
"name": "confirm",
"type": "boolean",
"doc": "indicates whether is self confirmed",
"default": false
},
{
"name": "installments",
"type": "int",
"doc": "Number of installments for the payment",
"default": 1
},
{
"name": "due_date",
"type": [
"null",
{
"type": "int",
"logicalType": "date"
}
],
"default": null
},
{
"name": "correlation_id",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"doc": "the external customer correlationid",
"default": null
},
{
"name": "billing",
"type": {
"type": "record",
"name": "Billing",
"fields": [
{
"name": "national_id",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"doc": "billing info",
"default": null
},
{
"name": "name",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"doc": "the consumer address name",
"default": null
}
]
}
},
{
"name": "status",
"type": {
"type": "enum",
"name": "payment_status",
"symbols": [
"INITIAL",
"CONSUMER",
"AUTHORIZED",
"WAITING_CONFIRMATION",
"CANCELED",
"WAITING_CLEARING",
"CLEARED",
"TRANSFERENCE",
"DECLINED_BY_ISSUER",
"DECLINED_BY_BUSINESS_RULES",
"CONFIRMED",
"WAITING_CANCELING",
"WAITING_CONSUMER",
"TRANSFER_REQUESTED"
],
"default": "INITIAL"
}
},
{
"name": "metadata",
"type": {
"type": "map",
"values": {
"type": "string",
"avro.java.string": "String"
},
"avro.java.string": "String"
}
},
{
"name": "events",
"type": {
"type": "array",
"items": {
"type": "record",
"name": "event",
"fields": [
{
"name": "id",
"type": {
"type": "string",
"avro.java.string": "String"
},
"default": "0"
},
{
"name": "type",
"type": {
"type": "enum",
"name": "event_type",
"symbols": [
"AUTHORIZATION",
"AUTHENTICATION",
"CONFIRMATION",
"CANCELATION",
"CHECKOUT_CREATION",
"SETTLEMENT",
"TRANSFER_VALIDATION",
"TRANSFER_SCHEDULE",
"TRANSFERRED"
]
}
},
{
"name": "gateway",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"default": null
},
{
"name": "breadcrumb_id",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"default": null
},
{
"name": "request_time",
"type": {
"type": "long",
"logicalType": "timestamp-millis"
},
"doc": "the moment where this request was received by the platform"
},
{
"name": "response_time",
"type": {
"type": "long",
"logicalType": "timestamp-millis"
},
"doc": "the moment where this request was returned by the platform"
},
{
"name": "status",
"type": {
"type": "enum",
"name": "event_status",
"symbols": [
"SUCCESS",
"DENIED",
"ERROR",
"TIMEOUT",
"PENDING"
]
}
},
{
"name": "actor",
"type": {
"type": "enum",
"name": "actor",
"symbols": [
"AQ",
"GTW",
"CONCIL"
],
"default": "GTW"
},
"default": "GTW"
},
{
"name": "amount",
"type": [
"null",
{
"type": "bytes",
"logicalType": "decimal",
"precision": 5,
"scale": 5
}
],
"doc": "the original currency amount",
"default": null
},
{
"name": "foreign_amount",
"type": [
"null",
{
"type": "bytes",
"logicalType": "decimal",
"precision": 5,
"scale": 5
}
],
"doc": "the foreign amount for the payment",
"default": null
},
{
"name": "error",
"type": {
"type": "record",
"name": "Error",
"fields": [
{
"name": "code",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"default": null
},
{
"name": "message",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"default": null
}
]
}
},
{
"name": "message",
"type": [
"null",
{
"type": "string",
"avro.java.string": "String"
}
],
"default": null
},
{
"name": "fee_amount",
"type": [
"null",
{
"type": "bytes",
"logicalType": "decimal",
"precision": 5,
"scale": 5
}
],
"doc": "the fee amount",
"default": null
},
{
"name": "net_amount",
"type": [
"null",
{
"type": "bytes",
"logicalType": "decimal",
"precision": 5,
"scale": 5
}
],
"doc": "the net amount",
"default": null
},
{
"name": "metadata",
"type": {
"type": "map",
"values": {
"type": "string",
"avro.java.string": "String"
},
"avro.java.string": "String"
}
},
{
"name": "internal_metadata",
"type": {
"type": "map",
"values": {
"type": "string",
"avro.java.string": "String"
},
"avro.java.string": "String"
}
}
]
}
}
},
{
"name": "bank_account",
"type": [
"null",
{
"type": "record",
"name": "bank_account",
"fields": [
{
"name": "name",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": "the bank name",
"default": ""
},
{
"name": "code",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": "the bank code",
"default": ""
},
{
"name": "agency",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": "the bank agency",
"default": ""
},
{
"name": "account",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": "the bank account",
"default": ""
},
{
"name": "document_number",
"type": {
"type": "string",
"avro.java.string": "String"
},
"doc": "the bank document number (CNPJ)",
"default": ""
}
]
}
],
"doc": "The bank account values",
"default": null
}
]
Try setting this as the "type" for all timestamp columns in your Avro schema:
"type": ["null", {"type": "long", "logicalType": "timestamp-millis"}]
The following deploys an Azure Function that runs the specified C#. How do I do the same for a function that should run Python?
I tried just changing the name to __init__.py, as generated when you use the azure-functions-core-tools func command with the --python switch, but I couldn't even find error messages explaining why things weren't working.
{
"$schema": "http://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"appName": {
"type": "string",
"metadata": {
"description": "The name of the function app that you wish to create."
}
},
"storageAccountType": {
"type": "string",
"defaultValue": "Standard_LRS",
"allowedValues": [
"Standard_LRS",
"Standard_GRS",
"Standard_ZRS",
"Premium_LRS"
],
"metadata": {
"description": "Storage Account type"
}
}
},
"variables": {
"functionAppName": "[parameters('appName')]",
"hostingPlanName": "[parameters('appName')]",
"storageAccountName": "[concat(uniquestring(resourceGroup().id), 'azfunctions')]"
},
"resources": [
{
"type": "Microsoft.Storage/storageAccounts",
"name": "[variables('storageAccountName')]",
"apiVersion": "2015-06-15",
"location": "[resourceGroup().location]",
"properties": {
"accountType": "[parameters('storageAccountType')]"
}
},
{
"type": "Microsoft.Web/serverfarms",
"apiVersion": "2015-04-01",
"name": "[variables('hostingPlanName')]",
"location": "[resourceGroup().location]",
"properties": {
"name": "[variables('hostingPlanName')]",
"computeMode": "Dynamic",
"sku": "Dynamic"
}
},
{
"apiVersion": "2015-08-01",
"type": "Microsoft.Web/sites",
"name": "[variables('functionAppName')]",
"location": "[resourceGroup().location]",
"kind": "functionapp",
"properties": {
"name": "[variables('functionAppName')]",
"serverFarmId": "[resourceId('Microsoft.Web/serverfarms', variables('hostingPlanName'))]"
},
"dependsOn": [
"[resourceId('Microsoft.Web/serverfarms', variables('hostingPlanName'))]",
"[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName'))]"
],
"resources": [
{
"apiVersion": "2016-03-01",
"name": "appsettings",
"type": "config",
"dependsOn": [
"[resourceId('Microsoft.Web/sites', variables('functionAppName'))]",
"[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName'))]"
],
"properties": {
"AzureWebJobsStorage": "[concat('DefaultEndpointsProtocol=https;AccountName=',variables('storageAccountName'),';AccountKey=',listkeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName')), '2015-05-01-preview').key1,';')]",
"AzureWebJobsDashboard": "[concat('DefaultEndpointsProtocol=https;AccountName=',variables('storageAccountName'),';AccountKey=',listkeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName')), '2015-05-01-preview').key1,';')]",
"FUNCTIONS_EXTENSION_VERSION": "latest"
}
},
{
"apiVersion": "2015-08-01",
"name": "TestFunctionCM",
"type": "functions",
"dependsOn": [
"[resourceId('Microsoft.Web/sites', variables('functionAppName'))]"
],
"properties": {
"config": {
"bindings": [
{
"authLevel": "anonymous",
"name": "req",
"type": "httpTrigger",
"direction": "in"
},
{
"name": "res",
"type": "http",
"direction": "out"
}
]
},
"files": {
"run.csx": "using System.Net;\r\n\r\n public static HttpResponseMessage Run(HttpRequestMessage req, TraceWriter log)\r\n\r\n {\r\n\r\nreturn req.CreateResponse(\"Hello from MyFunction\", HttpStatusCode.OK);\r\n\r\n }"
}
}
}
]
}
]
}
Thank you.
You will probably need the following runtime setting under appsettings:
"FUNCTIONS_WORKER_RUNTIME": "python"
My template looks a bit different, but it does deploy a Python function; here is the relevant resource from it:
{
"type": "Microsoft.Web/sites",
"apiVersion": "2018-11-01",
"name": "[parameters('name')]",
"location": "[parameters('location')]",
"dependsOn": [
"microsoft.insights/components/mycoolfunction",
"[concat('Microsoft.Web/serverfarms/', parameters('hostingPlanName'))]",
"[concat('Microsoft.Storage/storageAccounts/', parameters('storageAccountName'))]"
],
"tags": {},
"kind": "functionapp,linux",
"properties": {
"name": "[parameters('name')]",
"siteConfig": {
"appSettings": [
{
"name": "FUNCTIONS_WORKER_RUNTIME",
"value": "python"
},
{
"name": "FUNCTIONS_EXTENSION_VERSION",
"value": "~2"
},
{
"name": "APPINSIGHTS_INSTRUMENTATIONKEY",
"value": "[reference('microsoft.insights/components/mycoolfunction', '2015-05-01').InstrumentationKey]"
},
{
"name": "AzureWebJobsStorage",
"value": "[concat('DefaultEndpointsProtocol=https;AccountName=',parameters('storageAccountName'),';AccountKey=',listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName')), '2019-06-01').keys[0].value,';EndpointSuffix=','core.windows.net')]"
}
]
},
"serverFarmId": "[concat('/subscriptions/', parameters('subscriptionId'),'/resourcegroups/', parameters('serverFarmResourceGroup'), '/providers/Microsoft.Web/serverfarms/', parameters('hostingPlanName'))]",
"hostingEnvironment": "[parameters('hostingEnvironment')]",
"clientAffinityEnabled": false
}
}
I have this JSON output and I want to make sure that top_properties is not empty.
The keys in top_properties are dynamic, not static, and that is where I'm stuck.
{
"id": "test",
"name": "name",
"cake_name": "test",
"metric": 0.5,
"anticipations": [
{
"time": "2018-01-01 00:00:00",
"points": 0.49128797804879504,
"top_properties": {
"LA:TB2341": 0.23,
"LA:TB2342": 0.23,
"LA:TB2343": 0.23
},
"status": 0,
"alert": false
}
I have the schema below, but it won't fail when top_properties is empty. I want to make sure it fails when it's empty.
{
"definitions": {},
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "http://example.com/root.json",
"type": "object",
"title": "The Root Schema",
"required": [
"id",
"name",
"cake_name",
"metric",
"anticipations"
],
"properties": {
"id": {
"$id": "#/properties/id",
"type": "string",
"title": "The Id Schema",
"default": "",
"examples": [
"test"
],
"pattern": "^(.*)$"
},
"name": {
"$id": "#/properties/name",
"type": "string",
"title": "The Name Schema",
"default": "",
"examples": [
"name"
],
"pattern": "^(.*)$"
},
"cake_name": {
"$id": "#/properties/cake_name",
"type": "string",
"title": "The Cake_name Schema",
"default": "",
"examples": [
"test"
],
"pattern": "^(.*)$"
},
"metric": {
"$id": "#/properties/metric",
"type": "number",
"title": "The Metric Schema",
"default": 0.0,
"examples": [
0.5
]
},
"anticipations": {
"$id": "#/properties/anticipations",
"type": "array",
"title": "The Anticipations Schema",
"items": {
"$id": "#/properties/anticipations/items",
"type": "object",
"title": "The Items Schema",
"required": [
"time",
"points",
"top_properties",
"status",
"alert"
],
"properties": {
"time": {
"$id": "#/properties/anticipations/items/properties/time",
"type": "string",
"title": "The Time Schema",
"default": "",
"examples": [
"2018-01-01 00:00:00"
],
"pattern": "^(.*)$"
},
"points": {
"$id": "#/properties/anticipations/items/properties/points",
"type": "number",
"title": "The Points Schema",
"default": 0.0,
"examples": [
0.49128797804879504
]
},
"top_properties": {
"$id": "#/properties/anticipations/items/properties/top_properties",
"type": "object",
"title": "The Top_properties Schema",
"patternProperties": {
"[A-Za-z:0-9]": {
"type": "number"
}
},
"additionalProperties": false
},
"status": {
"$id": "#/properties/anticipations/items/properties/status",
"type": "integer",
"title": "The Status Schema",
"default": 0,
"examples": [
0
]
},
"alert": {
"$id": "#/properties/anticipations/items/properties/alert",
"type": "boolean",
"title": "The Alert Schema",
"default": false,
"examples": [
false
]
}
}
}
}
}
}
How do I use required with pattern properties when I don't have static key names? How is this implemented, if you have come across this situation?
You want the minProperties keyword: https://json-schema.org/understanding-json-schema/reference/object.html#size
For example:
{
"type": "object",
"patternProperties": {
"[A-Za-z:0-9]": { "type": "number" }
},
"minProperties": 1
}
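If you are running the validation from Python, a quick way to confirm the behaviour is the jsonschema package (a minimal sketch; the package choice is an assumption, not something from your setup):
import jsonschema

schema = {
    "type": "object",
    "patternProperties": {
        "[A-Za-z:0-9]": {"type": "number"}
    },
    "additionalProperties": False,
    "minProperties": 1
}

# Passes: at least one dynamically named numeric property.
jsonschema.validate(instance={"LA:TB2341": 0.23}, schema=schema)

# Fails: an empty object now violates minProperties.
try:
    jsonschema.validate(instance={}, schema=schema)
except jsonschema.ValidationError as error:
    print("rejected:", error.message)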
I have a large JSON file, about 5 million records and about 32 GB in size, that I need to load into our Snowflake data warehouse. I need to break this file up into chunks of about 200k records (about 1.25 GB) per file. I'd like to do this in either Node.js or Python for deployment to an AWS Lambda function; unfortunately, I haven't coded in either yet. I have C# and a lot of SQL experience, and learning both Node and Python is on my to-do list, so why not dive right in, right!?
My first question is: which language would better serve this task, Python or Node.js?
I know I don't want to read the entire JSON file into memory (or even the smaller output file). I need to be able to "stream" it in and out into a new file based on a record count (200k), properly close up the JSON objects, and continue into a new file for another 200k, and so on. I know Node can do this, but if Python can as well, I feel it would be easier to start using quickly for the other ETL work I'll be doing soon.
My second question is: based on your recommendation above, can you also recommend which modules I should require/import to help me get started, primarily as it relates to not pulling the entire JSON file into memory? Maybe some tips, tricks, or "how would you do it"s? And if you're feeling really generous, some example code to help push me into the deep end on this?
I can't include a sample of the JSON data, as it contains personal information. But I can provide the JSON schema...
{
"$schema": "http://json-schema.org/draft-04/schema#",
"items": {
"properties": {
"activities": {
"properties": {
"activity_id": {
"items": {
"type": "integer"
},
"type": "array"
},
"frontlineorg_id": {
"items": {
"type": "integer"
},
"type": "array"
},
"import_id": {
"items": {
"type": "integer"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"is_source": {
"items": {
"type": "boolean"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"address": {
"properties": {
"city": {
"items": {
"type": "string"
},
"type": "array"
},
"congress_dist_name": {
"items": {
"type": "string"
},
"type": "array"
},
"congress_dist_number": {
"items": {
"type": "integer"
},
"type": "array"
},
"congress_end_yr": {
"items": {
"type": "integer"
},
"type": "array"
},
"congress_number": {
"items": {
"type": "integer"
},
"type": "array"
},
"congress_start_yr": {
"items": {
"type": "integer"
},
"type": "array"
},
"county": {
"items": {
"type": "string"
},
"type": "array"
},
"formatted": {
"items": {
"type": "string"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"latitude": {
"items": {
"type": "number"
},
"type": "array"
},
"longitude": {
"items": {
"type": "number"
},
"type": "array"
},
"number": {
"items": {
"type": "string"
},
"type": "array"
},
"observes_dst": {
"items": {
"type": "boolean"
},
"type": "array"
},
"post_directional": {
"items": {
"type": "null"
},
"type": "array"
},
"pre_directional": {
"items": {
"type": "null"
},
"type": "array"
},
"school_district": {
"items": {
"properties": {
"school_dist_name": {
"items": {
"type": "string"
},
"type": "array"
},
"school_dist_type": {
"items": {
"type": "string"
},
"type": "array"
},
"school_grade_high": {
"items": {
"type": "string"
},
"type": "array"
},
"school_grade_low": {
"items": {
"type": "string"
},
"type": "array"
},
"school_lea_code": {
"items": {
"type": "integer"
},
"type": "array"
}
},
"type": "object"
},
"type": "array"
},
"secondary_number": {
"items": {
"type": "null"
},
"type": "array"
},
"secondary_unit": {
"items": {
"type": "null"
},
"type": "array"
},
"state": {
"items": {
"type": "string"
},
"type": "array"
},
"state_house_dist_name": {
"items": {
"type": "string"
},
"type": "array"
},
"state_house_dist_number": {
"items": {
"type": "integer"
},
"type": "array"
},
"state_senate_dist_name": {
"items": {
"type": "string"
},
"type": "array"
},
"state_senate_dist_number": {
"items": {
"type": "integer"
},
"type": "array"
},
"street": {
"items": {
"type": "string"
},
"type": "array"
},
"suffix": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"timezone": {
"items": {
"type": "string"
},
"type": "array"
},
"utc_offset": {
"items": {
"type": "integer"
},
"type": "array"
},
"zip": {
"items": {
"type": "integer"
},
"type": "array"
}
},
"type": "object"
},
"age": {
"type": "integer"
},
"anniversary": {
"properties": {
"date": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"baptism": {
"properties": {
"church_id": {
"type": "null"
},
"date": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"birth_dd": {
"type": "integer"
},
"birth_mm": {
"type": "integer"
},
"birth_yyyy": {
"type": "integer"
},
"church_attendance": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"likelihood": {
"items": {
"type": "integer"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"cohabiting": {
"properties": {
"confidence": {
"items": {
"type": "string"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"likelihood": {
"items": {
"type": "null"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"dating": {
"properties": {
"bool": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"divorced": {
"properties": {
"bool": {
"items": {
"type": "null"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"likelihood_considering": {
"items": {
"type": "integer"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"education": {
"properties": {
"est_level": {
"items": {
"type": "string"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"email": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"is_work_school": {
"items": {
"type": "boolean"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"engaged": {
"properties": {
"insert_datetime_utc": {
"type": "null"
},
"likelihood": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"est_income": {
"properties": {
"est_level": {
"items": {
"type": "string"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"ethnicity": {
"type": "string"
},
"first_name": {
"type": "string"
},
"formatted_birthdate": {
"type": "string"
},
"gender": {
"type": "string"
},
"head_of_household": {
"properties": {
"bool": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"home_church": {
"properties": {
"church_id": {
"type": "null"
},
"group_participant": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"is_coaching": {
"type": "null"
},
"is_giving": {
"type": "null"
},
"is_serving": {
"type": "null"
},
"membership_date": {
"type": "null"
},
"regular_attendee": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"hub_poid": {
"type": "integer"
},
"insert_datetime_utc": {
"type": "string"
},
"ip_address": {
"properties": {
"insert_datetime_utc": {
"type": "null"
},
"string": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"last_name": {
"type": "string"
},
"marriage_segment": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"married": {
"properties": {
"bool": {
"items": {
"type": "boolean"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"middle_name": {
"type": "string"
},
"miscellaneous": {
"properties": {
"attribute": {
"items": {
"type": "string"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"value": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"name_suffix": {
"type": "null"
},
"name_title": {
"type": "null"
},
"newlywed": {
"properties": {
"bool": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"parent": {
"properties": {
"bool": {
"items": {
"type": "boolean"
},
"type": "array"
},
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"likelihood_expecting": {
"items": {
"type": "integer"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"person_id": {
"type": "integer"
},
"phone": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"number": {
"items": {
"type": "integer"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"type": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"property_rights": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"psychographic_cluster": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"religion": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"religious_segment": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"separated": {
"properties": {
"bool": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"significant_other": {
"properties": {
"first_name": {
"type": "null"
},
"insert_datetime_utc": {
"type": "null"
},
"last_name": {
"type": "null"
},
"middle_name": {
"type": "null"
},
"name_suffix": {
"type": "null"
},
"name_title": {
"type": "null"
},
"suppressed_datetime_utc": {
"type": "null"
}
},
"type": "object"
},
"suppressed_datetime_utc": {
"type": "string"
},
"target_group": {
"properties": {
"insert_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
},
"string": {
"items": {
"type": "string"
},
"type": "array"
},
"suppressed_datetime_utc": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
}
},
"type": "object"
},
"type": "array"
}
Use these commands at a Linux command prompt:
split -b 53750k <your-file>
cat xa* > <your-file>
Refer to this link:
https://askubuntu.com/questions/28847/text-editor-to-edit-large-4-3-gb-plain-text-file
Answering whether Python or Node will be better for the task would be an opinion, and we are not allowed to voice our opinions on Stack Overflow. You have to decide for yourself which you have more experience in and which you want to work with: Python or Node.
If you go with Node, there are modules that can help you with this task by doing streaming JSON parsing, e.g. these modules:
https://www.npmjs.com/package/JSONStream
https://www.npmjs.com/package/stream-json
https://www.npmjs.com/package/json-stream
If you go with Python, there are streaming JSON parsers here as well:
https://github.com/kashifrazzaqui/json-streamer
https://github.com/danielyule/naya
http://www.enricozini.org/blog/2011/tips/python-stream-json/
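For what it's worth, here is a minimal sketch of the streaming approach in Python using the ijson package (not one of the links above, but another widely used incremental JSON parser); the file names and chunk size are placeholders:
import json
import ijson  # pip install ijson

def split_json_array(in_path, out_prefix, chunk_size=200000):
    buffer, file_index = [], 0
    with open(in_path, 'rb') as f:
        # ijson.items(f, 'item') yields each element of the top-level JSON array
        # one at a time, without loading the whole file into memory.
        for record in ijson.items(f, 'item'):
            buffer.append(record)
            if len(buffer) >= chunk_size:
                with open(f"{out_prefix}_{file_index}.json", 'w') as out:
                    json.dump(buffer, out, default=float)  # ijson may yield Decimal values
                buffer, file_index = [], file_index + 1
    if buffer:  # write whatever is left over
        with open(f"{out_prefix}_{file_index}.json", 'w') as out:
            json.dump(buffer, out, default=float)

split_json_array('bigfile.json', 'chunk')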
Consider using jq to preprocess your JSON files; it can split and stream your large JSON files.
jq is like sed for JSON data - you can use it to slice and filter and map and transform structured data with the same ease that sed, awk, grep and friends let you play with text.
See the official documentation and this question for more.
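For example, one way to do it (a sketch, assuming the top-level value is a JSON array; the file name and prefix are placeholders) is to have jq emit one record per line and then split by line count:
jq -c '.[]' bigfile.json | split -l 200000 - chunk_
The resulting files are newline-delimited JSON, which Snowflake can load directly. Note that plain jq reads the whole input into memory; for a 32 GB file you may need jq's --stream mode instead.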
Extra: regarding your first question, jq is written in C, so it's faster than Python/Node, isn't it?
Snowflake treats JSON in a special way, and once you understand it, the design is easy to draw up.
JSON/Parquet/Avro/XML are considered semi-structured data.
They are stored as the VARIANT data type in Snowflake.
When loading the JSON data from the stage location, set the flag strip_outer_array = true:
copy into <table>
from #~/<file>.json
file_format = (type = 'JSON' strip_outer_array = true);
Each row cannot exceed 16 MB compressed when loaded into Snowflake.
Snowflake data loading works well when the input is split into files of roughly 10-100 MB each.
Use a utility that splits the file on record boundaries, keeping each file at no more than 100 MB; that gives you parallelism as well as accuracy for your data.
At your data set size, that works out to roughly 320 small files (of ~100 MB each).
Ideally they would all load in parallel, but that is not possible in one pass, so choose an X-Large warehouse (16 v-cores & 32 threads):
320 / 32 = about 10 rounds.
This should not take more than a few minutes to load, depending on your network bandwidth.
Look at the warehouse configuration and throughput details, and refer to the semi-structured data loading best practices.
The easiest approach that worked for me was this:
json_file = <your_file>  # the JSON array already loaded into memory, e.g. via json.load()
chunk_size = 200
for i in range(0, len(json_file), chunk_size):
    print(json_file[i:i + chunk_size])
To split and compress at the same time with bash, resulting in files of ~100MB each:
cat bigfile.json | split -C 1000000000 -d -a4 - output_prefix --filter='gzip > $FILE.gz'
See more: https://stackoverflow.com/a/68718176/132438
You can use Python3 with the following script:
import json
import math

def split_json(file_path, parts=3):
    # Note: this loads the entire file into memory at once.
    with open(file_path, 'r') as json_file:
        data = json.load(json_file)
    # Round up so the last chunk keeps any remainder records.
    chunk_size = math.ceil(len(data) / parts)
    for i in range(parts):
        with open(f"part{i}.json", 'w') as outfile:
            json.dump(data[i * chunk_size:(i + 1) * chunk_size], outfile)

file_path = input("Enter the file path of the JSON file: ")
split_json(file_path)