Create Avro file directly to Google Cloud Storage - python

I'd like to skip the step of creating the Avro file locally and instead upload it directly to Google Cloud Storage.
I looked at the blob.upload_from_string option, but honestly I don't know what it should replace in my code, or whether it's the best option for what I need. With that change I could build a more modern pipeline and package the script inside a Docker image.
Can this be done somehow, based on the script below?
import csv
import base64
import json
import io
import avro.schema
import avro.io
from avro.datafile import DataFileReader, DataFileWriter
import math
import os
import gcloud
from gcloud import storage
from google.cloud import bigquery
from oauth2client.client import GoogleCredentials
from datetime import datetime, timedelta
import numpy as np

try:
    script_path = os.path.dirname(os.path.abspath(__file__)) + "/"
except:
    script_path = "C:\\Users\\me\\Documents\\Keys\\key.json"

# BigQuery credentials and settings
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = script_path
folder = str((datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d'))
bucket_name = 'gs://new_bucket/table/*.csv'
dataset = 'dataset'
tabela = 'table'

schema = avro.schema.Parse(open("C:\\Users\\me\\schema_table.avsc", "rb").read())
writer = DataFileWriter(open("C:\\Users\\me\\table_register.avro", "wb"), avro.io.DatumWriter(), schema)
def insert_bigquery(target_uri, dataset_id, table_id):
    bigquery_client = bigquery.Client()
    dataset_ref = bigquery_client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.schema = [
        bigquery.SchemaField('id', 'STRING', mode='REQUIRED')
    ]
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.field_delimiter = ";"
    uri = target_uri
    load_job = bigquery_client.load_table_from_uri(
        uri,
        dataset_ref.table(table_id),
        job_config=job_config
    )
    print('Starting job {}'.format(load_job.job_id))
    load_job.result()
    print('Job finished.')
#insert_bigquery(bucket_name, dataset, tabela)
def get_data_from_bigquery():
    """query bigquery to get data to import to PSQL"""
    bq = bigquery.Client()
    # Fetch IDs
    query = """SELECT id FROM dataset.base64_data"""
    query_job = bq.query(query)
    data = query_job.result()
    rows = list(data)
    return rows
a = get_data_from_bigquery()
length = len(a)
line_count = 0

for row in range(length):
    bytes = base64.b64decode(str(a[row][0]))
    bytes = bytes[5:]
    buf = io.BytesIO(bytes)
    decoder = avro.io.BinaryDecoder(buf)
    rec_reader = avro.io.DatumReader(avro.schema.Parse(open("C:\\Users\\me\\schema_table.avsc").read()))
    out = rec_reader.read(decoder)
    writer.append(out)

writer.close()
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob("insert_transfer/" + destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print('File {} uploaded to {}'.format(
        source_file_name,
        destination_blob_name
    ))

upload_blob('new_bucket', 'C:\\Users\\me\\table_register.avro', 'table_register.avro')
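For context, here is a minimal sketch of the in-memory variant the question asks about: the Avro container is written into an io.BytesIO buffer and pushed to the bucket with upload_from_string, so no local .avro file is created. The bucket and object names are the ones used above; the append loop is elided.

import io
import avro.schema
import avro.io
from avro.datafile import DataFileWriter
from google.cloud import storage

schema = avro.schema.Parse(open("schema_table.avsc", "rb").read())

# Write the Avro container into an in-memory buffer instead of a local file
buffer = io.BytesIO()
writer = DataFileWriter(buffer, avro.io.DatumWriter(), schema)
# writer.append(record) for every decoded record, as in the loop above
writer.flush()  # flush the container block without closing the buffer

# Upload the buffer contents directly to Cloud Storage
storage_client = storage.Client()
bucket = storage_client.get_bucket('new_bucket')
blob = bucket.blob('insert_transfer/table_register.avro')
blob.upload_from_string(buffer.getvalue(), content_type='application/octet-stream')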

I have looked at your script and I can see that you are getting the data from BigQuery. I can confirm that I reproduced your scenario and was able to export data from BigQuery to Google Cloud Storage directly, without creating the Avro file locally.
I suggest you take a look here, where it describes how to export table data from BigQuery to Google Cloud Storage. These are the steps to follow:
Open the BigQuery web UI in your Cloud Console.
In the navigation panel, in the Resources section, expand your project and click your dataset to expand it. Find and click the table that contains the data you're exporting.
On the right side of the window, click Export, then select Export to Cloud Storage.
In the Export to Cloud Storage dialog:
For Select Cloud Storage location, browse for the bucket.
For Export format, choose the format for your exported data; in your specific case, choose “Avro”.
Click Export.
Nonetheless, there is also the possibility to do it with Python. I recommend you take a look here.
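For reference, a minimal sketch of that Python route: a BigQuery extract job that writes Avro straight to Cloud Storage. The dataset, table and bucket names below are placeholders taken from the question.

from google.cloud import bigquery

client = bigquery.Client()

# Placeholder names: adapt dataset/table/bucket to your project
dataset_ref = client.dataset('dataset')
table_ref = dataset_ref.table('table')
destination_uri = 'gs://new_bucket/insert_transfer/table_register.avro'

job_config = bigquery.job.ExtractJobConfig()
job_config.destination_format = bigquery.DestinationFormat.AVRO

extract_job = client.extract_table(
    table_ref,
    destination_uri,
    job_config=job_config
)
extract_job.result()  # wait for the export to finish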
I hope this approach works for you.

Related

How to check if a file with partial name exists in GCS bucket in Python?

I'm trying to write Python code which checks for a particular file in a folder of a Cloud Storage bucket; if the file exists it should enter a loop, otherwise it should exit. The file name I want to check is a daily incremental file which starts with "DL_Ticket_".
An actual file name looks like: DL_Ticket_merged_20220406201501.csv
The file path looks like: gs://standard/inbound/DL_Ticket_merged_20220406201501.csv
For this functionality I've written the code shown below.
from google.cloud import storage
from googleapiclient.discovery import build
import time

def df_load_function(file, context):
    name = 'DL_Ticket_'
    storage_client = storage.Client()
    bucket_name = 'standard'
    bucket = storage_client.bucket(bucket_name)
    stats = storage.Blob(bucket=bucket, name="inbound/DL_Ticket_").exists(storage_client)
    print(stats)
    if stats == True:
        #my code
    else:
        exit()
With the above code, I'm always getting False even though the file exists in the bucket.
I'm implementing this code in a Cloud Function on GCP. There are no errors while deploying the code. Can someone please help me achieve this?
Code using list_blobs with a prefix:
from google.cloud import storage
from googleapiclient.discovery import build
import time
import os

def mv_blob(bucket_name, blob_name):
    client = storage.Client()
    bucket_name = "standard"
    bucket = client.bucket(bucket_name)
    blobs = list(bucket.list_blobs(prefix='inbound/DL_Ticket_'))
    for blob in blobs:
        blob_name = blob.name
        print("{}.".format(blob_name))

if blob_name != '':
    def df_load_function(file, context):
        filesnames = [
            'DL_Customer_',
            'DL_RegisteredProducts_',
            'DL_Ticket_'
        ]
        for i in filesnames:
            if 'inbound/{}'.format(i) in file['name']:
                print("Processing file: {filename}".format(filename=file['name']))
                project = 'nonprod'
                inputfile = 'gs://standard/inbound/' + file['name']
                job = 'df_load_wave1_{}'.format(i)
                template = 'gs://standard/template/df_load_wave1_{}'.format(i)
                location = 'asia-south1'
                dataflow = build('dataflow', 'v1b3', cache_discovery=False)
                request = dataflow.projects().locations().templates().launch(
                    projectId=project,
                    gcsPath=template,
                    location=location,
                    body={
                        'jobName': job,
                        "environment": {
                            "workerZone": "asia-south1-a"
                        }
                    }
                )
                # Execute the Dataflow job
                response = request.execute()
                job_id = response["job"]["id"]
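For what it's worth, a minimal sketch of a prefix-based existence check; Blob.exists() only matches an exact object name, which is why the first snippet always returns False. The bucket name and prefix below are the ones from the question.

from google.cloud import storage

def blob_with_prefix_exists(bucket_name, prefix):
    """Return True if at least one object in the bucket starts with the prefix."""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    # max_results=1 keeps the listing cheap; we only care whether anything matches
    blobs = list(client.list_blobs(bucket, prefix=prefix, max_results=1))
    return len(blobs) > 0

if blob_with_prefix_exists('standard', 'inbound/DL_Ticket_'):
    print("Daily file found, continue processing")
else:
    print("No matching file yet")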

Loading data directly from BigQuery to Orange

I would like to ask whether it is possible to load data into Orange directly, e.g. from BigQuery. I added a "Python Script" widget to the workflow and my script looks like this:
import os
import sys
import Orange

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'path to application_default_credentials.json'

from google.cloud import storage
from google.cloud import bigquery
from google.cloud import secretmanager

bigquery_client = bigquery.Client(project='my project')
secret_client = secretmanager.SecretManagerServiceClient()

query = """
my query
"""
query_job = bigquery_client.query(query)
out_data = query_job
I also tried passing query_job to Orange.data.Table, but it does not work. How can I load the data from Python directly into Orange?
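For reference, a minimal sketch of one way this could work inside the Python Script widget, assuming Orange 3's pandas helper Orange.data.pandas_compat.table_from_frame is available: materialize the query result as a pandas DataFrame, convert it to an Orange Table, and assign it to out_data.

from google.cloud import bigquery
from Orange.data.pandas_compat import table_from_frame  # assumes Orange 3 with pandas support

bigquery_client = bigquery.Client(project='my project')
query_job = bigquery_client.query("""
my query
""")

# Convert the BigQuery result to a pandas DataFrame, then to an Orange Table
df = query_job.result().to_dataframe()
out_data = table_from_frame(df)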

Transform .xlsx in BLOB storage to .csv using pandas without downloading to local machine

I'm dealing with a transformation from .xlsx files to .csv. I tested locally a Python script that downloads .xlsx files from a container in Blob Storage, manipulates the data, saves the results as a .csv file (using pandas) and uploads it to a new container. Now I need to bring the Python script into ADF to build a pipeline that automates the task. I'm dealing with two kinds of problems:
First problem: I can't figure out how to complete the task without downloading the file to my local machine.
I found these threads/tutorials, but the "azure" v5.0.0 meta-package is deprecated:
read excel files from "input" blob storage container and export to csv in "output" container with python
Tutorial: Run Python scripts through Azure Data Factory using Azure Batch
So far my code is:
import os
import sys
import pandas as pd
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, PublicAccess

# Create the BlobServiceClient that is used to call the Blob service for the storage account
conn_str = 'XXXX;EndpointSuffix=core.windows.net'
blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)
container_name = "input"
blob_name = "prova/excel/AAA_prova1.xlsx"

container = ContainerClient.from_connection_string(conn_str=conn_str, container_name=container_name)
downloaded_blob = container.download_blob(blob_name)
df = pd.read_excel(downloaded_blob.content_as_bytes(), skiprows=4)
data = df.to_csv(r'C:\mypath/AAA_prova2.csv', encoding='utf-8-sig', index=False)

full_path_to_file = r'C:\mypath/AAA_prova2.csv'
local_file_name = 'prova\csv\AAA_prova2.csv'

# Upload to blob storage
blob_client = blob_service_client.get_blob_client(
    container=container_name, blob=local_file_name)
with open(full_path_to_file, "rb") as data:
    blob_client.upload_blob(data)
Second problem: with this method I can only deal with a specific blob name, but in the future I'll have to parametrize the script (i.e. select only blob names starting with AAA_). I can't work out whether I have to handle this in the Python script or whether I can filter the files through ADF (i.e. by adding a Filter File task before running the Python script). I can't find any tutorial or code snippet, so any help, hint or documentation would be very much appreciated. A sketch of the Python-side option is shown below.
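For the Python-side option, a minimal sketch of filtering blobs by name prefix, assuming the azure-storage-blob v12 SDK and the same container and connection string as above (the prefix prova/excel/AAA_ is illustrative):

from azure.storage.blob import BlobServiceClient

conn_str = 'XXXX;EndpointSuffix=core.windows.net'
blob_service_client = BlobServiceClient.from_connection_string(conn_str)
container_client = blob_service_client.get_container_client("input")

# list_blobs(name_starts_with=...) returns only blobs whose names begin with the prefix
for blob_props in container_client.list_blobs(name_starts_with="prova/excel/AAA_"):
    print("Would process:", blob_props.name)
    # download, transform with pandas and upload as in the snippet above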
EDIT
I modified the code to avoid downloading to the local machine, and now it works (problem #1 solved):
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from io import BytesIO
import pandas as pd

filename = "excel/prova.xlsx"
container_name = "input"

blob_service_client = BlobServiceClient.from_connection_string("XXXX==;EndpointSuffix=core.windows.net")
container_client = blob_service_client.get_container_client(container_name)
blob_client = container_client.get_blob_client(filename)

streamdownloader = blob_client.download_blob()
stream = BytesIO()
streamdownloader.download_to_stream(stream)
df = pd.read_excel(stream, skiprows=5)

local_file_name_out = "csv/prova.csv"
container_name_out = "input"
blob_client = blob_service_client.get_blob_client(
    container=container_name_out, blob=local_file_name_out)
blob_client.upload_blob(df.to_csv(path_or_buf=None, encoding='utf-8-sig', index=False))
This is an Azure Functions (Python 3.8) version of the same approach. It waits for a blob trigger from the Excel upload, does some processing, and reuses a good chunk of your code for the final step.
Note the split to trim the .xlsx extension off the file name.
This is what I ended up with:
source_blob = (f"https://{account_name}.blob.core.windows.net/{uploadedxlsx.name}")
file_name = uploadedxlsx.name.split("/")[2]
container_name = "container"

container_client = blob_service_client.get_container_client(container_name)
blob_client = container_client.get_blob_client(f"Received/{file_name}")
streamdownloader = blob_client.download_blob()
stream = BytesIO()
streamdownloader.download_to_stream(stream)
df = pd.read_excel(stream)

file_name_t = file_name.split(".")[0]
local_file_name_out = f"Converted/{file_name_t}.csv"
container_name_out = "out_container"

blob_client = blob_service_client.get_blob_client(
    container=container_name_out, blob=local_file_name_out)
blob_client.upload_blob(df.to_csv(path_or_buf=None, encoding='utf-8-sig', index=False))

CSV to .GZ using Cloud function, Python

I've been trying to compress my CSV files to .gz before uploading to GCS using a Cloud Function (Python 3.7), but my code only adds the .gz extension and doesn't really compress the file, so in the end the file is corrupted. Can you please show me how to fix this? Thanks.
Here is part of my code:
# imports used by this snippet (the original excerpt shows only gzip)
import time
import gzip
from google.cloud import bigquery
from google.cloud import storage

def to_gcs(request):
    job_config = bigquery.QueryJobConfig()
    gcs_filename = 'filename_{}.csv'
    bucket_name = 'bucket_gcs_name'
    subfolder = 'subfolder_name'
    client = bigquery.Client()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    QUERY = "SELECT * FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` session, UNNEST(hits) AS hits"
    query_job = client.query(
        QUERY,
        location='US',
        job_config=job_config)
    while not query_job.done():
        time.sleep(1)
    rows_df = query_job.result().to_dataframe()
    storage_client = storage.Client()
    storage_client.get_bucket(bucket_name).blob(subfolder + '/' + gcs_filename + '.gz').upload_from_string(rows_df.to_csv(sep='|', index=False, encoding='utf-8', compression='gzip'), content_type='application/octet-stream')
As suggested in the thread referred to by @Sam Mason in a comment, once you have obtained the pandas DataFrame you should use a TextIOWrapper() and a BytesIO(), as described in the following sample.
The following sample was inspired by @ramhiser's answer in this SO thread:
from io import BytesIO, TextIOWrapper
import gzip

df = query_job.result().to_dataframe()
blob = bucket.blob(f'{subfolder}/{gcs_filename}.gz')

with BytesIO() as gz_buffer:
    with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file:
        df.to_csv(TextIOWrapper(gz_file, 'utf8'), index=False)
    # rewind the buffer so the whole gzip stream is uploaded
    gz_buffer.seek(0)
    blob.upload_from_file(gz_buffer,
                          content_type='application/octet-stream')
Also note that if you expect this file to ever get larger than a couple of MB, you are probably better off using something from the tempfile module in place of BytesIO. SpooledTemporaryFile is basically designed for this use case: it keeps a memory buffer up to a given size and only spills to disk if the file gets really big. A sketch of that variant follows.
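A minimal sketch of that variant, assuming a 10 MB in-memory threshold before spilling to disk (df, blob, subfolder and gcs_filename as above):

import gzip
from io import TextIOWrapper
from tempfile import SpooledTemporaryFile

# stays in memory up to max_size bytes, then transparently moves to a temp file on disk
with SpooledTemporaryFile(max_size=10 * 1024 * 1024) as gz_buffer:
    with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file:
        df.to_csv(TextIOWrapper(gz_file, 'utf8'), index=False)
    gz_buffer.seek(0)
    blob.upload_from_file(gz_buffer, content_type='application/octet-stream')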
Hi, I tried to reproduce your use case:
I created a Cloud Function using this quickstart link:
def hello_world(request):
    from google.cloud import bigquery
    from google.cloud import storage
    import pandas as pd

    client = bigquery.Client()
    storage_client = storage.Client()
    path = '/tmp/file.gz'

    query_job = client.query("""
        SELECT
          CONCAT(
            'https://stackoverflow.com/questions/',
            CAST(id as STRING)) as url,
          view_count
        FROM `bigquery-public-data.stackoverflow.posts_questions`
        WHERE tags like '%google-bigquery%'
        ORDER BY view_count DESC
        LIMIT 10""")
    results = query_job.result().to_dataframe()
    results.to_csv(path, sep='|', index=False, encoding='utf-8', compression='gzip')

    bucket = storage_client.get_bucket('mybucket')
    blob = bucket.blob('file.gz')
    blob.upload_from_filename(path)
This is the requirements.txt:
# Function dependencies, for example:
google-cloud-bigquery
google-cloud-storage
pandas
I deployed the function and checked the output:
gsutil cp gs://mybucket/file.gz file.gz
gzip -d file.gz
cat file
url|view_count
https://stackoverflow.com/questions/22879669|52306
https://stackoverflow.com/questions/13530967|46073
https://stackoverflow.com/questions/35159967|45991
https://stackoverflow.com/questions/10604135|45238
https://stackoverflow.com/questions/16609219|37758
https://stackoverflow.com/questions/11647201|32963
https://stackoverflow.com/questions/13221978|32507
https://stackoverflow.com/questions/27060396|31630
https://stackoverflow.com/questions/6607552|31487
https://stackoverflow.com/questions/11057219|29069

Read in azure blob using python

I want to read an excel file stored in Azure blob storage to a python data frame. What method would I use?
There is a function named read_excel in the pandas package; you can pass it the URL of an online Excel file to get a DataFrame of the Excel table, as in the figure below.
So you just need to generate a URL for the Excel blob with a SAS token and then pass it to the function.
Here is my sample code. Note: it requires installing the Python packages azure-storage, pandas and xlrd.
# Generate a url of excel blob with sas token
from azure.storage.blob.baseblobservice import BaseBlobService
from azure.storage.blob import BlobPermissions
from datetime import datetime, timedelta

account_name = '<your storage account name>'
account_key = '<your storage key>'
container_name = '<your container name>'
blob_name = '<your excel blob>'

blob_service = BaseBlobService(
    account_name=account_name,
    account_key=account_key
)
sas_token = blob_service.generate_blob_shared_access_signature(container_name, blob_name, permission=BlobPermissions.READ, expiry=datetime.utcnow() + timedelta(hours=1))
blob_url_with_sas = blob_service.make_blob_url(container_name, blob_name, sas_token=sas_token)

# pass the blob url with sas to function `read_excel`
import pandas as pd
df = pd.read_excel(blob_url_with_sas)
print(df)
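Note that this sample uses the legacy azure-storage SDK (BaseBlobService). If you are on the newer azure-storage-blob v12 package instead, a roughly equivalent sketch (same account, container and blob placeholders) would be:

from datetime import datetime, timedelta
import pandas as pd
from azure.storage.blob import generate_blob_sas, BlobSasPermissions

# Build a read-only SAS token for the blob, valid for one hour
sas_token = generate_blob_sas(
    account_name=account_name,
    container_name=container_name,
    blob_name=blob_name,
    account_key=account_key,
    permission=BlobSasPermissions(read=True),
    expiry=datetime.utcnow() + timedelta(hours=1),
)
blob_url_with_sas = (
    f"https://{account_name}.blob.core.windows.net/"
    f"{container_name}/{blob_name}?{sas_token}"
)
df = pd.read_excel(blob_url_with_sas)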
I used my sample Excel file to test the code above; it works fine.
Fig 1. My sample excel file testing.xlsx in my test container of Azure Blob Storage
Fig 2. The content of my sample excel file testing.xlsx
Fig 3. The result of my sample Python code to read excel blob
