How to stream a CSV file into BigQuery? - python

The examples I have found so far stream JSON into BQ, e.g. https://cloud.google.com/bigquery/streaming-data-into-bigquery
How do I stream a CSV (or any other file type) into BQ? Below is a block of code for streaming, and the "issue" seems to be in insert_all_data, where 'row' is defined as JSON. Thanks.
# [START stream_row_to_bigquery]
def stream_row_to_bigquery(bigquery, project_id, dataset_id, table_name, row,
                           num_retries=5):
    insert_all_data = {
        'rows': [{
            'json': row,
            # Generate a unique id for each row so retries don't accidentally
            # duplicate insert
            'insertId': str(uuid.uuid4()),
        }]
    }
    return bigquery.tabledata().insertAll(
        projectId=project_id,
        datasetId=dataset_id,
        tableId=table_name,
        body=insert_all_data).execute(num_retries=num_retries)
# [END stream_row_to_bigquery]

This is how I wrote it using the bigquery-python library; it was very easy.
from bigquery import get_client

def insert_data(datasetname, table_name, DataObject):
    # project_id, service_account and key are assumed to be defined elsewhere
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=False,
                        swallow_results=False)
    insertObject = DataObject
    try:
        result = client.push_rows(datasetname, table_name, insertObject)
    except Exception as err:
        print(err)
        raise
    return result
Here insertObject is a list of dictionaries, where one dictionary contains one row.
e.g.: [{field1: value1, field2: value2}, {field1: value3, field2: value4}]
The CSV can be read as follows:
import pandas as pd

# C (the list of date columns), schema and _sorted_list are assumed to be
# defined elsewhere: schema holds the table's field definitions and
# _sorted_list holds the column data in schema order.
fileCsv = pd.read_csv(file_path + '/' + filename, parse_dates=C,
                      infer_datetime_format=True)
data = []
for row_x in range(len(fileCsv.index)):
    i = 0
    row = {}
    for col_y in schema:
        row[col_y['name']] = _sorted_list[i]['col_data'][row_x]
        i += 1
    data.append(row)
insert_data(datasetname, table_name, data)
The data list can then be passed to insert_data.
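If the CSV column names already match the BigQuery field names, a simpler way to build that list (a sketch, not part of the original answer) is pandas' to_dict:

# Assumes fileCsv was loaded with pd.read_csv as above and that its
# column names match the BigQuery schema field names.
data = fileCsv.to_dict(orient='records')
insert_data(datasetname, table_name, data)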
Either way, this will do it, but there is still a limitation that I already raised here.
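Tying this back to the streaming snippet in the question: since the 'json' field of insertAll just takes a plain Python dict per row, a csv.DictReader loop can feed stream_row_to_bigquery directly. A minimal sketch, assuming the CSV has a header row and that bigquery, project_id, dataset_id and table_name are set up as in the question:

import csv

with open('data.csv', newline='') as f:   # 'data.csv' is a placeholder path
    for row in csv.DictReader(f):
        # Each row is a dict keyed by the CSV header, which is exactly the
        # shape the 'json' field of insertAll expects.
        stream_row_to_bigquery(bigquery, project_id, dataset_id,
                               table_name, row)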

Related

AWS Glue create_partition using boto3 successful, but Athena not showing results for query

I have a Glue script to create new partitions using create_partition(). The Glue script runs successfully, and I can see the partitions in the Athena console when using SHOW PARTITIONS. For the Glue script create_partitions, I referred to this sample code: https://medium.com/#bv_subhash/demystifying-the-ways-of-creating-partitions-in-glue-catalog-on-partitioned-s3-data-for-faster-e25671e65574
When I try to run an Athena query against a newly added partition, I get no results.
Do I need to trigger the MSCK command even if I add the partitions using create_partitions? Appreciate any suggestions.
I found the solution myself and wanted to share it with the SO community, so it may be useful to someone. The following code, when run as a Glue job, creates partitions that can also be queried in Athena on the new partition columns. Please change/add the parameter values (database name, table name, partition columns) as needed.
import boto3
import urllib.parse
import os
import copy
import sys

# Configure database / table name and emp_id, file_id from workflow params?
DATABASE_NAME = 'my_db'
TABLE_NAME = 'enter_table_name'
emp_id_tmp = ''
file_id_tmp = ''

# Initialise the Glue client using Boto 3
glue_client = boto3.client('glue')

# Get the current table schema for the given database name & table name
def get_current_schema(database_name, table_name):
    try:
        response = glue_client.get_table(
            DatabaseName=database_name,
            Name=table_name
        )
    except Exception as error:
        print("Exception while fetching table info")
        sys.exit(-1)

    # Parse the table info required to create partitions from the table
    table_data = {}
    table_data['input_format'] = response['Table']['StorageDescriptor']['InputFormat']
    table_data['output_format'] = response['Table']['StorageDescriptor']['OutputFormat']
    table_data['table_location'] = response['Table']['StorageDescriptor']['Location']
    table_data['serde_info'] = response['Table']['StorageDescriptor']['SerdeInfo']
    table_data['partition_keys'] = response['Table']['PartitionKeys']
    return table_data

# Prepare the partition input list using table_data
def generate_partition_input_list(table_data):
    input_list = []  # Initializing empty list
    part_location = "{}/emp_id={}/file_id={}/".format(
        table_data['table_location'], emp_id_tmp, file_id_tmp)
    input_dict = {
        'Values': [
            emp_id_tmp, file_id_tmp
        ],
        'StorageDescriptor': {
            'Location': part_location,
            'InputFormat': table_data['input_format'],
            'OutputFormat': table_data['output_format'],
            'SerdeInfo': table_data['serde_info']
        }
    }
    input_list.append(input_dict.copy())
    return input_list

# Create the partition dynamically using the partition input list
table_data = get_current_schema(DATABASE_NAME, TABLE_NAME)
input_list = generate_partition_input_list(table_data)
try:
    create_partition_response = glue_client.batch_create_partition(
        DatabaseName=DATABASE_NAME,
        TableName=TABLE_NAME,
        PartitionInputList=input_list
    )
    print('Glue partition created successfully.')
    print(create_partition_response)
except Exception as e:
    # Handle exception as per your business requirements
    print(e)
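As a quick sanity check (not part of the original answer), the new partition can be listed back from the Glue catalog; since Athena reads partitions from the Glue Data Catalog, anything that shows up here should be queryable without running MSCK REPAIR TABLE:

# Verification sketch; DATABASE_NAME / TABLE_NAME and the emp_id / file_id
# values are the same placeholders used above.
partitions = glue_client.get_partitions(
    DatabaseName=DATABASE_NAME,
    TableName=TABLE_NAME,
    Expression="emp_id = '{}' AND file_id = '{}'".format(emp_id_tmp, file_id_tmp)
)
for p in partitions['Partitions']:
    print(p['Values'], p['StorageDescriptor']['Location'])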

Azure Function to CosmosDB

I need help with a function that would take a JSON payload and write its values to Cosmos DB. Everything I have read shows only single parameters.
name = req.params.get('name')
if not name:
    try:
        req_body = req.get_json()
    except ValueError:
        pass
    else:
        name = req_body.get('name')

if name:
    count = 1
    try:
        counter = container_client.read_item(item=name, partition_key=name)
        counter['count'] += 1
        container_client.replace_item(item=counter['id'], body=counter)
        count = counter['count']
    except exceptions.CosmosResourceNotFoundError:
        # Create new item
        container_client.create_item({'id': name, 'count': count})

return func.HttpResponse(f"Hello, {name}! Current count is {count}.")
This code works, but I would like to send something like {name: Kyle, job: engineer} and have those values added to the table.
I followed this blog to achieve your requirement.
Try adding the JSON values in the below format and inserting them into Cosmos DB.
if name:
    newdocs = func.DocumentList()
    # Creating the user details as JSON in a container of a Cosmos DB
    # ('doc' below is the function's Cosmos DB output binding)
    newproduct_dict = {
        "id": str(uuid.uuid4()),
        "name": name
    }
    newdocs.append(func.Document.from_dict(newproduct_dict))
    doc.set(newdocs)
By using this I was able to add the JSON values to Cosmos DB.
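To get closer to the {name: Kyle, job: engineer} case from the question, the same output-binding pattern can pass the whole request body through instead of a single field. A sketch, assuming an HTTP-triggered function whose function.json declares a Cosmos DB output binding named doc:

import json
import uuid
import azure.functions as func

def main(req: func.HttpRequest, doc: func.Out[func.DocumentList]) -> func.HttpResponse:
    try:
        body = req.get_json()   # e.g. {"name": "Kyle", "job": "engineer"}
    except ValueError:
        return func.HttpResponse("Please pass a JSON body", status_code=400)

    body.setdefault("id", str(uuid.uuid4()))   # Cosmos DB items need an 'id'
    newdocs = func.DocumentList()
    newdocs.append(func.Document.from_dict(body))
    doc.set(newdocs)   # every key in the body becomes a field on the item

    return func.HttpResponse(json.dumps(body), mimetype="application/json")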

Read json data from covid19 api using python

I was trying to import time-series data from the link Covid_data to get the daily historical and 7-day moving average data, but my code doesn't work. I am new to this, so maybe my key-value pairs are not correct. The structure of the file is given here: json_structure_link.
My code:
import requests
import pandas as pd

response = requests.get("https://api.covid19india.org/v4/min/timeseries.min.json")
if response.status_code == 200:
    historical_day_numbers = response.json()
    DATE = []
    STATE = []
    TOTAL_CASES = []
    RECOVERED = []
    DECEASED = []
    TESTED = []
    VACCINATED = []
    for state in historical_day_numbers.keys():
        STATE.append(state)
        DATE.append(historical_day_numbers[state]["dates"])
        TOTAL_CASES.append(historical_day_numbers[state]["dates"]["delta"]["confirmed"])
        RECOVERED.append(historical_day_numbers[state]["dates"]["delta"]["recovered"])
        DECEASED.append(historical_day_numbers[state]["dates"]["delta"]["deceased"])
        TESTED.append(historical_day_numbers[state]["dates"]["delta"]["tested"])
        VACCINATED.append(historical_day_numbers[state]["dates"]["delta"]["vaccinated"])
    Covid19_historical_data = pd.DataFrame(
        {
            "STATE/UT": STATE,
            "DATE": DATE,
            "TOTAL_CASES": TOTAL_CASES,
            "RECOVERED": RECOVERED,
            "DECEASED": DECEASED,
            "TESTED": TESTED,
            "VACCINATED": VACCINATED,
        }
    )
    #print(data.head())
else:
    print("Error while calling API: {}".format(response.status_code, response.reason))
The error I am getting:
KeyError: 'delta'
But I can see that delta is present in the structure.
historical_day_numbers[state]['dates'].keys()
Output: dict_keys(['2020-04-06', '2020-04-07', '2020-04-08', '2020-04-09', '2020-04-10', '2020-04-11', '2020-04-12', '2020-04-13', '2020-04-14', '2020-04-15', '2020-04-16', '2020-04-17', '2020-04-18', '2020-04-19', '2020-04-20', '2020-04-21',...])
When you inspect these keys, you will see that there is a key for each date and there is no key called 'delta' at that level.
If you edit your code as follows, you will not get this error:
historical_day_numbers[state]['dates']['2021-07-25']['delta']
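Building on that, one way to get a per-state, per-date DataFrame is to loop over the date keys and pull the delta block for each date. A sketch reusing historical_day_numbers and pd from the question's code (.get() is used because not every date reports every metric, and the 'vaccinated' key is kept as spelled in the question; adjust it if the API uses a different name):

rows = []
for state, state_data in historical_day_numbers.items():
    for date, day_data in state_data["dates"].items():
        delta = day_data.get("delta", {})
        rows.append({
            "STATE/UT": state,
            "DATE": date,
            "TOTAL_CASES": delta.get("confirmed"),
            "RECOVERED": delta.get("recovered"),
            "DECEASED": delta.get("deceased"),
            "TESTED": delta.get("tested"),
            "VACCINATED": delta.get("vaccinated"),
        })

Covid19_historical_data = pd.DataFrame(rows)
print(Covid19_historical_data.head())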

Transfer CSV to Elasticsearch from Python with document_id as a CSV field

I wanted to transfer the following CSV to Elasticsearch:
| hcode | hname |
|-------|-------|
| 1     | aaaa  |
| 2     | bbbbb |
| 3     | ccccc |
| 4     | dddd  |
| 5     | eeee  |
| 6     | ffff  |
and I need to insert the hcode field as the document_id. I am getting the error below:
File "C:\Users\Namali\Anaconda3\lib\site-packages\elasticsearch\connection\base.py", line 181, in _raise_error
status_code, error_message, additional_info
RequestError: RequestError(400, 'mapper_parsing_exception', 'failed to parse')
The Elasticsearch version is 7.1.1 and the Python version is 3.7.6.
Python code:
import csv
import json
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

def csv_reader(file_obj, delimiter=','):
    reader_ = csv.reader(file_obj, delimiter=delimiter, quotechar='"')
    i = 1
    results = []
    for row in reader_:
        #try:
        #    es.index(index='hb_hotel_raw', doc_type='hb_hotel_raw', id=row[0],
        #             body=json.dump([row for row in reader_], file_obj))
        es.index(index='test', doc_type='test', id=row[0], body=json.dumps(row))
        #except:
        #    print("error")
        i = i + 1
        results.append(row)
        print(row)

if __name__ == "__main__":
    with open("D:\\namali\\rez\\data_mapping\\test.csv") as f_obj:
        csv_reader(f_obj)
First, the doc_type parameter is deprecated in Elasticsearch 7. Second, you need to pass a valid JSON document to Elasticsearch. I edited your code as below:
for row in reader_:
    _id = row[0].split("|")[1]
    text = row[0].split("|")[2]
    my_dict = {"hname": text}
    es.index(index='test', id=_id, body=my_dict)
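If the file is large, the same idea can be expressed with the bulk helper so that each row is not a separate HTTP round trip. A rough sketch, assuming the file really is pipe-delimited with leading/trailing pipes as shown and has a header line:

import csv
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

def actions(path):
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter='|')
        next(reader)                      # skip the header row
        for row in reader:
            # With leading/trailing pipes the useful fields are row[1] and row[2]
            yield {
                "_index": "test",
                "_id": row[1],            # hcode becomes the document id
                "_source": {"hname": row[2]},
            }

helpers.bulk(es, actions("test.csv"))     # placeholder path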
<disclosure: I'm the developer of Eland and employed by Elastic>
If you're willing to load the CSV into a Pandas DataFrame, you can use Eland to create/append the tabular data to an Elasticsearch index with all data types resolved properly.
I would recommend reading the pandas.read_csv() and eland.pandas_to_eland() function documentation for ideas on how to accomplish this.
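A rough sketch of that approach (parameter names taken from the documented eland.pandas_to_eland() signature; double-check the current docs, and note the assumption that Eland uses the DataFrame index as the Elasticsearch _id, which is how hcode would become the document id):

import eland as ed
import pandas as pd
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# Read the pipe-delimited file, drop the empty columns produced by the
# leading/trailing pipes, and use hcode as the DataFrame index.
df = pd.read_csv("test.csv", sep="|").dropna(axis=1, how="all")
df = df.set_index("hcode")

ed.pandas_to_eland(
    pd_df=df,
    es_client=es,
    es_dest_index="test",
    es_if_exists="append",   # documented options: 'fail', 'replace', 'append'
    es_refresh=True,
)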

What is the most efficient way of retrieving the port value from this json list

I have the list below, from which I have to retrieve the port number. I want the value 50051, but what I get is port=50051. I know I can retrieve it by iterating over the list and using string operations, but I wanted to see if there is a more direct way to access it.
r = requests.get(url_service)
data = {}
data = r.json()
# Below is the json after printing
[{'ServerTag': ['abc-service=true',
                'port=50051',
                'protocol=http']
}]
print(data[0]["ServiceTags"][1])  # prints port=50051
You can do something like this perhaps:
received_dic = {
    'ServerTag': ['abc-service=true',
                  'port=50051',
                  'protocol=http']
}

ServerTag = received_dic.get("ServerTag", None)
if ServerTag:
    port = list(filter(lambda x: "port" in x, ServerTag))[0].split("=")[1]
    print(port)
    # 50051
Considering you have the following JSON:
[
    {
        "ServerTag": ["abc-service=true", "port=50051", "protocol=http"]
    }
]
You can extract your value like this:
from functools import partial

# ...

def extract_value_from_tag(tags, name, default=None):
    tags = map(partial(str.split, sep='='), tags)
    try:
        return next(value for key, value in tags if key == name)
    except StopIteration:
        # Tag was not found
        return default
And then you just:
# Providing data is the deserialized JSON as a Python list
# Also assuming that data is not empty and ServerTag is present on the first object
tags = data[0].get('ServerTag', [])
port_number = extract_value_from_tag(tags, 'port', default='8080')
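Another compact option (not from either answer above) is to parse all the tags into a dict once and then look keys up directly; splitting with maxsplit=1 keeps values that themselves contain '=' intact:

# data is the deserialized JSON list from r.json(); the tag key is spelled
# 'ServerTag' in the printed JSON and 'ServiceTags' in the print call in the
# question, so use whichever your payload actually contains.
tags = data[0].get('ServerTag', [])
tag_map = dict(tag.split('=', 1) for tag in tags)

port = int(tag_map.get('port', 8080))   # -> 50051
print(port)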
