Code:
import mysql.connector
import sys

def write_file(data, filename):
    # Write the binary data out to a file on disk.
    with open(filename, 'wb') as f:
        f.write(data)

sampleNum = 0
db_config = mysql.connector.connect(user='root', password='test',
                                    host='localhost',
                                    database='technical')

# Query the BLOB data from the document_control table.
cursor = db_config.cursor()
try:
    sampleNum = sampleNum + 1
    query = "SELECT fileAttachment FROM document_control WHERE id=%s"
    cursor.execute(query, (sampleNum,))
    file = cursor.fetchone()[0]
    write_file(file, 'User' + str(sampleNum) + '.docx')
except AttributeError as e:
    print(e)
finally:
    cursor.close()
What it does
The code above fetches the file stored as a BLOB in MySQL and saves it as a .docx file in a folder.
Question
Instead of saving the file, I want to view it and then delete it. Can I simply open the BLOB in Word without saving it to disk first?
If so, how can it be done?
In general, passing binary data like a BLOB entity as a file-like object can be done with the built-in module io, for example:
import io
f = io.BytesIO(data)
# f can now be used anywhere a file object is expected
But your question really comes down to whether MS Word can open a file that isn't saved anywhere on disk, and I don't think it can. Best practice would probably be to generate a temporary file with tempfile, so that you can at least expect the system to clean it up eventually:
import tempfile

with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as f:
    f.write(data)
    print(f.name)
Edit:
In your code in particular, you could try the following to store the data in a temporary file and automatically open it in MS Word:
import tempfile, subprocess

WINWORD_PATH = r'C:\Program Files (x86)\Microsoft Office\Office14\winword.exe'

def open_as_temp_docx(data):
    # Write the BLOB to a temporary .docx file, then open it in MS Word.
    with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as f:
        f.write(data)
    subprocess.Popen([WINWORD_PATH, f.name])

cursor = db_config.cursor()
try:
    sampleNum = sampleNum + 1
    query = "SELECT fileAttachment FROM document_control WHERE id=%s"
    cursor.execute(query, (sampleNum,))
    open_as_temp_docx(cursor.fetchone()[0])
finally:
    cursor.close()
I don't have a Windows machine with MS Word at hand, so I can't test this. The path to winword.exe on your machine may vary, so make sure it is correct.
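If the path turns out to be hard to pin down, an alternative I haven't tested (and which only works on Windows) is to skip winword.exe entirely and let the OS open the file with whatever application is associated with .docx, using os.startfile:

import os
import tempfile

def open_as_temp_docx(data):
    # Variant of the function above: write the BLOB to a temporary .docx
    # and hand it to the default application for that extension.
    with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as f:
        f.write(data)
    os.startfile(f.name)  # Windows only; returns no process handle to wait on

Since os.startfile gives you nothing to wait on, this variant cannot delete the file as soon as Word closes.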
Edit:
If it is important to delete the file as soon as MS Word closes, the following should work:
import tempfile, subprocess, os

WINWORD_PATH = r'C:\Program Files (x86)\Microsoft Office\Office14\winword.exe'

def open_as_temp_docx(data):
    with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as f:
        f.write(data)
    # Wait for MS Word to exit, then remove the temporary file.
    subprocess.Popen([WINWORD_PATH, f.name]).wait()
    if os.path.exists(f.name):
        os.unlink(f.name)
Related
I have a query that generates a CSV file from the data in a Postgres table, and the script works fine.
But now I need to create separate files using data from a different table.
Basically, only the hardcoded values below change and the rest of the code stays the same, so right now I have to create a separate script for every CSV.
Is there a way to have one script and only change these parameters?
I'm using Jenkins to automate the CSV file creation.
filePath = '/home/jenkins/data/'
fileName = 'data.csv'
import csv
import os
import psycopg2
from pprint import pprint
from datetime import datetime
from utils.config import Configuration as Config
from utils.postgres_helper import get_connection
from utils.utils import get_global_config

# File path and name.
filePath = '/home/jenkins/data/'
fileName = 'data.csv'

# Database connection variable.
connect = None

# Check if the file path exists.
if os.path.exists(filePath):
    try:
        # Connect to database.
        connect = get_connection(get_global_config(), 'dwh')
    except psycopg2.DatabaseError as e:
        # Confirm unsuccessful connection and stop program execution.
        print("Database connection unsuccessful.")
        quit()

    # Cursor to execute query.
    cursor = connect.cursor()

    # SQL to select data from the google feed table.
    sqlSelect = "SELECT * FROM data"

    try:
        # Execute query.
        cursor.execute(sqlSelect)

        # Fetch the data returned.
        results = cursor.fetchall()

        # Extract the table headers.
        headers = [i[0] for i in cursor.description]

        # Open CSV file for writing.
        csvFile = csv.writer(open(filePath + fileName, 'w', newline=''),
                             delimiter=',', lineterminator='\r\n',
                             quoting=csv.QUOTE_ALL, escapechar='\\')

        # Add the headers and data to the CSV file.
        csvFile.writerow(headers)
        csvFile.writerows(results)

        # Message stating export successful.
        print("Data export successful.")
        print('CSV Path : ' + filePath + fileName)
    except psycopg2.DatabaseError as e:
        # Message stating export unsuccessful.
        print("Data export unsuccessful.")
        quit()
    finally:
        # Close database connection.
        connect.close()
else:
    # Message stating file path does not exist.
    print("File path does not exist.")
I have a script that gets all of the .zip files from a folder and then, one by one, opens each zip file, loads the content of the JSON file inside, and imports it into MongoDB.
The error I am getting is: the JSON object must be str, bytes or bytearray, not 'TextIOWrapper'
The code is:
import json
import logging
import logging.handlers
import os
from logging.config import fileConfig
from pymongo import MongoClient

def import_json():
    try:
        client = MongoClient('5.57.62.97', 27017)
        db = client['vuln_sets']
        coll = db['vulnerabilities']
        basepath = os.path.dirname(__file__)
        filepath = os.path.abspath(os.path.join(basepath, ".."))
        archive_filepath = filepath + '/vuln_files/'
        filedir = os.chdir(archive_filepath)
        for item in os.listdir(filedir):
            if item.endswith('.json'):
                file_name = os.path.abspath(item)
                fp = open(file_name, 'r')
                json_data = json.loads(fp)
                for vuln in json_data:
                    print(vuln)
                    coll.insert(vuln)
                os.remove(file_name)
    except Exception as e:
        logging.exception(e)
I can get this working for a single file but not for multiple files; for one file I wrote:
from zipfile import ZipFile
import json
import pymongo

archive = ZipFile("vulners_collections/cve.zip")
archived_file = archive.open(archive.namelist()[0])
archive_content = archived_file.read()
archived_file.close()

connection = pymongo.MongoClient("mongodb://localhost")
db = connection.vulnerability
vuln1 = db.vulnerability_collection
vulners_objects = json.loads(archive_content)
for item in vulners_objects:
    vuln1.insert(item)
From my comment above:
I have no experience with glob, but from skimming the docs I get the impression your archive_files is a simple list of file paths as strings, correct? You cannot call methods like .open on a string (hence your error), so try changing your code to this:
...
archive_filepath = filepath + '/vuln_files/'
archive_files = glob.glob(archive_filepath + "/*.zip")
for file in archive_files:
    with open(file, "r") as currentFile:
        file_content = currentFile.read()
        vuln_content = json.loads(file_content)
        for item in vuln_content:
            coll.insert(item)
...
file is NOT a file object; it is just a plain string, so you can't call methods on it that strings don't support.
You are redefining your iterator by setting it to the result of the namelist method. You need a for loop inside the outer for, with a new iterator variable, to go through the contents of each zip file.
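For example, a rough sketch of that nested loop (coll and archive_files are assumed to exist as in the snippets above):

import json
import zipfile

for zip_path in archive_files:
    with zipfile.ZipFile(zip_path) as archive:
        # New iterator variable for the members inside this archive.
        for member_name in archive.namelist():
            with archive.open(member_name) as member:
                vuln_content = json.loads(member.read())
                for item in vuln_content:
                    coll.insert(item)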
Isn't file.close wrong? The correct call is file.close().
You can use json.load() to load the file directly, instead of json.loads():
fp = open(file_name, 'r')
json_data = json.load(fp)
fp.close()
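Or, equivalently, with a with block so the file is closed even if parsing raises an exception:

with open(file_name, 'r') as fp:
    json_data = json.load(fp)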
On my server I'm trying to read from a bunch of sqlite3 databases (sent from web clients) and process their data. The db files are in an S3 bucket; I have their URLs and I can load them into memory.
The problem is that sqlite3.connect only takes a path string, so I can't pass it a file held in memory.
conn = sqlite3.connect()  # how to pass file in memory or url
c = conn.cursor()
c.execute('''select * from data;''')
res = c.fetchall()
# other processing with res
SQLite requires database files to be stored on disk (it uses various locks and paging techniques). An in-memory file will not suffice.
I'd create a temporary directory to hold the database file, write it to that directory, then connect to it. The directory gives SQLite the space to write commit logs as well.
To handle all this, a context manager might be helpful:
import os.path
import shutil
import sqlite3
import sys
import tempfile
from contextlib import contextmanager

@contextmanager
def sqlite_database(inmemory_data):
    # Write the in-memory data to a database file in a temporary directory.
    path = tempfile.mkdtemp()
    with open(os.path.join(path, 'sqlite.db'), 'wb') as dbfile:
        dbfile.write(inmemory_data)
    conn = None
    try:
        conn = sqlite3.connect(os.path.join(path, 'sqlite.db'))
        yield conn
    finally:
        if conn is not None:
            conn.close()
        try:
            shutil.rmtree(path)
        except IOError:
            sys.stderr.write('Failed to clean up temp dir {}'.format(path))
and use that as:
with sqlite_database(yourdata) as connection:
    # query the database
This writes in-memory data to disk, opens a connection, lets you use that connection, and afterwards cleans up after you.
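Tying it back to the question's snippet, usage could look roughly like this (fetching the bytes with urllib2 is just an assumption here; any way of getting the database contents into memory will do):

import urllib2

# Placeholder URL; in the question the files live in an S3 bucket.
db_bytes = urllib2.urlopen('https://s3.amazonaws.com/my-bucket/client.db').read()

with sqlite_database(db_bytes) as connection:
    c = connection.cursor()
    c.execute('''select * from data;''')
    res = c.fetchall()
    # other processing with res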
ISSUE RESOLVED: So it turns out that I never actually had an issue in the first place. When I counted the number of records to determine how many I should expect to be imported, blank spaces between .json objects were being counted towards the total record count. However, upon importing, only the objects with content were moved. I'll just leave this post here for reference anyway. Thank you to those who contributed regardless.
I have around 33 GB of .json files that were retrieved from Twitter's streaming API and stored in a local directory. I am trying to import this data into a MongoDB collection. I have made two attempts:
First attempt: read through each file individually (~70 files). This successfully imported 11,171,885 / 22,343,770 documents.
import json
import glob
from pymongo import MongoClient

directory = '/data/twitter/output/*.json'
client = MongoClient("localhost", 27017)
db = client.twitter
collection = db.test
jsonFiles = glob.glob(directory)
for file in jsonFiles:
    f = open(file, 'r')
    for line in f.read().split("\n"):
        if line:
            try:
                lineJson = json.loads(line)
            except (ValueError, KeyError, TypeError) as e:
                pass
            else:
                postid = collection.insert(lineJson)
                print 'inserted with id: ', postid
    f.close()
Second attempt: concatenate each .json file into one large file. This successfully imported 11,171,879 / 22,343,770 documents.
import json
import os
from pymongo import MongoClient
import sys

client = MongoClient("localhost", 27017)
db = client.tweets
collection = db.test
script_dir = os.path.dirname(__file__)
file_path = os.path.join(script_dir, '/data/twitter/blob/historical-tweets.json')
try:
    with open(file_path, 'r') as f:
        for line in f.read().split("\n"):
            if line:
                try:
                    lineJson = json.loads(line)
                except (ValueError, KeyError, TypeError) as e:
                    pass
                else:
                    postid = collection.insert(lineJson)
                    print 'inserted with id: ', postid
        f.close()
The Python script did not error out or output a traceback; it simply stopped running. Any ideas as to what could be causing this? Or any alternative solutions for importing the data more efficiently? Thanks in advance.
You are reading the file one line at a time. Is each line really valid JSON? If not, json.loads will raise an exception, and you are hiding that exception with the pass statement.
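One quick way to check is to count and report the lines that fail to parse instead of passing silently. A small sketch built on the loop from the question (the skipped counter is made up):

skipped = 0
for line in f.read().split("\n"):
    if line:
        try:
            lineJson = json.loads(line)
        except (ValueError, KeyError, TypeError) as e:
            skipped += 1
            print('skipping line: %s' % e)
        else:
            collection.insert(lineJson)
print('lines skipped: %d' % skipped)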
I'm trying to inspect my appengine backup files to work out when a data corruption occurred. I used gsutil to locate and download the file:
gsutil ls -l gs://my_backup/ > my_backup.txt
gsutil cp gs://my_backup/LongAlphaString.Mymodel.backup_info file://1.backup_info
I then created a small python program, attempting to read the file and parse it using the appengine libraries.
#!/usr/bin/python
APPENGINE_PATH = '/Applications/GoogleAppEngineLauncher.app/Contents/Resources/GoogleAppEngine-default.bundle/Contents/Resources/google_appengine/'
ADDITIONAL_LIBS = [
    'lib/yaml/lib'
]
import sys
sys.path.append(APPENGINE_PATH)
for l in ADDITIONAL_LIBS:
    sys.path.append(APPENGINE_PATH + l)
import logging
from google.appengine.api.files import records
import cStringIO

def parse_backup_info_file(content):
    """Returns entities iterator from a backup_info file content."""
    reader = records.RecordsReader(cStringIO.StringIO(content))
    version = reader.read()
    if version != '1':
        raise IOError('Unsupported version')
    return (datastore.Entity.FromPb(record) for record in reader)

INPUT_FILE_NAME = '1.backup_info'
f = open(INPUT_FILE_NAME, 'rb')
f.seek(0)
content = f.read()
records = parse_backup_info_file(content)
for r in records:
    logging.info(r)
f.close()
The code for parse_backup_info_file was copied from
backup_handler.py
When I run the program, I get the following output:
./view_record.py
Traceback (most recent call last):
  File "./view_record.py", line 30, in <module>
    records = parse_backup_info_file(content)
  File "./view_record.py", line 19, in parse_backup_info_file
    version = reader.read()
  File "/Applications/GoogleAppEngineLauncher.app/Contents/Resources/GoogleAppEngine-default.bundle/Contents/Resources/google_appengine/google/appengine/api/files/records.py", line 335, in read
    (chunk, record_type) = self.__try_read_record()
  File "/Applications/GoogleAppEngineLauncher.app/Contents/Resources/GoogleAppEngine-default.bundle/Contents/Resources/google_appengine/google/appengine/api/files/records.py", line 307, in __try_read_record
    (length, len(data)))
EOFError: Not enough data read. Expected: 24898 but got 2112
I've tried with half a dozen different backup_info files, and they all show the same error (with different numbers).
I had noticed that they all reported the same expected length, but that was only because I was reviewing different versions of the same model when I made the observation; it does not hold when I view the backup files of other Modules:
EOFError: Not enough data read. Expected: 24932 but got 911
EOFError: Not enough data read. Expected: 25409 but got 2220
Is there anything obviously wrong with my approach?
I guess the other option is that the appengine backup utility is not creating valid backup files.
Anything else you can suggest would be very welcome.
Thanks in Advance
There are multiple metadata files created when an AppEngine Datastore backup is run:
LongAlphaString.backup_info is created once. This contains metadata about all of the entity types and backup files that were created in the datastore backup.
LongAlphaString.[EntityType].backup_info is created once per entity type. This contains metadata about the specific backup files created for [EntityType], along with schema information for [EntityType].
Your code works for interrogating the file contents of LongAlphaString.backup_info; however, it seems that you are trying to interrogate the file contents of LongAlphaString.[EntityType].backup_info. Here's a script that will print the contents in a human-readable format for each file type:
import cStringIO
import os
import sys

sys.path.append('/usr/local/google_appengine')
from google.appengine.api import datastore
from google.appengine.api.files import records
from google.appengine.ext.datastore_admin import backup_pb2

ALL_BACKUP_INFO = 'long_string.backup_info'
ENTITY_KINDS = ['long_string.entity_kind.backup_info']

def parse_backup_info_file(content):
    """Returns entities iterator from a backup_info file content."""
    reader = records.RecordsReader(cStringIO.StringIO(content))
    version = reader.read()
    if version != '1':
        raise IOError('Unsupported version')
    return (datastore.Entity.FromPb(record) for record in reader)

print "*****" + ALL_BACKUP_INFO + "*****"
with open(ALL_BACKUP_INFO, 'r') as myfile:
    parsed = parse_backup_info_file(myfile.read())
    for record in parsed:
        print record

for entity_kind in ENTITY_KINDS:
    print os.linesep + "*****" + entity_kind + "*****"
    with open(entity_kind, 'r') as myfile:
        backup = backup_pb2.Backup()
        backup.ParseFromString(myfile.read())
        print backup