Upload PDF file with gdata docs Python v3.0 with OCR - python

I've got the following implementation for uploading a pdf file to google docs (taken from the gdata API samples):
def UploadResourceSample():
    """Upload a document, and convert to Google Docs."""
    client = CreateClient()
    doc = gdata.docs.data.Resource(type='document', title='My Sample Doc')
    # This is a convenient MS Word doc that we know exists
    path = _GetDataFilePath('test.0.doc')
    print 'Selected file at: %s' % path
    # Create a MediaSource, pointing to the file
    media = gdata.data.MediaSource()
    media.SetFileHandle(path, 'application/msword')
    # Pass the MediaSource when creating the new Resource
    doc = client.CreateResource(doc, media=media)
    print 'Created, and uploaded:', doc.title.text, doc.resource_id.text
Now I would like to run OCR on the uploaded file, but I can't see how to enable OCR in the gdata docs Python API. So my question is:
Is there a way to enable OCR using the gdata Python v3.0 API on a pdf file?

I've managed to get my pdf document OCR'ed using the following code:
def UploadResourceSample(filename, filepath, fullpath):
    """Upload a document, and convert to Google Docs."""
    client = CreateClient()
    doc = gdata.docs.data.Resource(type='document', title=filename)
    path = fullpath
    print 'Selected file at: %s' % path
    # Create a MediaSource, pointing to the file
    media = gdata.data.MediaSource()
    media.SetFileHandle(path, 'application/pdf')
    # Append ocr=true (and an ocr-language) to the upload URI to enable OCR
    create_uri = gdata.docs.client.RESOURCE_UPLOAD_URI + '?ocr=true&ocr-language=de'
    # Pass the MediaSource and the OCR-enabled URI when creating the new Resource
    doc = client.CreateResource(doc, create_uri=create_uri, media=media)
    print 'Created, and uploaded:', doc.title.text, doc.resource_id.text
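
For reference, a call might then look like this (the file name and paths are hypothetical, and CreateClient() is the helper from the gdata samples):

# Hypothetical invocation; adjust paths to your environment.
UploadResourceSample(
    filename='scan.pdf',                    # title of the new Google Doc
    filepath='/home/user/scans',            # not used by the body above
    fullpath='/home/user/scans/scan.pdf')   # local path to the PDF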

Related

hug create GET endpoint to download created word document

I'm working with the Python hug API and would like to create a GET endpoint for the frontend, so that the frontend can download a created Word document, e.g. via a download button. However, after going through the documentation, I still cannot figure out a way to do it.
Here is my working script so far:
import os
import hug
from docx import Document

@hug.get("/download_submission_document")
def download_submission_document():
    file_name = 'example.docx'
    document = Document()
    document.add_heading('Test header', level=2)
    document.add_paragraph('Test paragraph')
    document.save(file_name)
    # TODO: send the created file to the frontend
I'm not sure if we can send the object right away, or whether we have to save it somewhere first before sending it to the frontend. (requirements: hug, python-docx)
I'm trying to use something like
@hug.get("/download_submission_document", output=hug.output_format.file)
but am not sure how to return a file.
Alright, I found a solution which is easier than I thought. Just do the following:
@hug.get("/download_submission_document", output=hug.output_format.file)
def download_submission_document():
    file_name = 'example.docx'
    document = Document()
    document.add_heading('Test header', level=2)
    document.add_paragraph('Test paragraph')
    document.save(file_name)
    return file_name
Returning file_name already downloads the docx.
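
One caveat worth noting: the handler writes example.docx into the working directory, so concurrent requests would overwrite each other's file. A minimal sketch of a safer variant, assuming the same hug and python-docx setup, writes each response into its own temporary directory:

import os
import tempfile

import hug
from docx import Document

@hug.get("/download_submission_document", output=hug.output_format.file)
def download_submission_document():
    # Each request gets its own directory, so files never collide.
    tmp_dir = tempfile.mkdtemp()
    file_path = os.path.join(tmp_dir, 'example.docx')
    document = Document()
    document.add_heading('Test header', level=2)
    document.add_paragraph('Test paragraph')
    document.save(file_path)
    return file_path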

Why is Google Cloud Vision api unable to detect text in a particular pdf file although it works fine on a very similar pdf?

The Google Cloud Vision API works fine on one pdf (pdf1) but returns absolutely nothing on the other (pdf2). I'm unable to make sense of this behavior, as both pdfs are very similar and use almost the same font. Please help.
I'm using the code given in their examples section, uploading these files to a Google Cloud bucket.
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS"""
    import re

    from google.cloud import vision
    from google.cloud import storage
    from google.protobuf import json_format

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'
    # How many pages should be grouped into each json output file.
    batch_size = 2
    client = vision.ImageAnnotatorClient()
    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)
    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)
    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)
    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)
    operation = client.async_batch_annotate_files(
        requests=[async_request])
    print('Waiting for the operation to finish.')
    operation.result(timeout=180)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)
    bucket = storage_client.get_bucket(bucket_name=bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]
    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print(u'Full text:\n{}'.format(annotation.text))
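
For context, the function is invoked with GCS URIs; the names below are placeholders:

# Hypothetical bucket/object names; the service account needs access to both.
async_detect_document(
    'gs://my-input-bucket/pdf2.pdf',
    'gs://my-output-bucket/ocr-output/')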
It probably has nothing to do with the GCloud API. I tried uploading your pdf to the Vision drag-and-drop website and it returns the expected results. Maybe the pdf gets corrupted at some point in your pipeline? What does it look like in Cloud Storage?
We also faced this issue, and after a few experiments it seems to be caused by certain fonts that Google Vision is unable to support.
To work around it, convert the pdf to images and then send the images for processing; that will produce results.
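
A minimal sketch of that workaround, assuming the pdf2image package (which requires poppler to be installed) and the same older vision.types client used above; ocr_pdf_via_images is a hypothetical helper name:

import io

from google.cloud import vision
from pdf2image import convert_from_path

def ocr_pdf_via_images(pdf_path):
    """Rasterize each PDF page and OCR the page images individually."""
    client = vision.ImageAnnotatorClient()
    pages = convert_from_path(pdf_path, dpi=300)  # one PIL image per page
    texts = []
    for page in pages:
        buf = io.BytesIO()
        page.save(buf, format='PNG')
        image = vision.types.Image(content=buf.getvalue())
        response = client.document_text_detection(image=image)
        texts.append(response.full_text_annotation.text)
    return '\n'.join(texts)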

Is there any cloud API to save internet image to cloud using Python?

I have a project in hand to back up a website for some reason. I use Requests in Python to crawl the contents and image urls. The problem is: how can I save an image to a cloud service (Google Drive, Dropbox, etc.) using just the url of that image?
I know there is a way to first save the image locally and then upload the local file to the cloud, but I'm wondering if there are APIs that support uploading images by url rather than from a local file.
It seems like Dropbox has a feature called /save_url that
lets app developers upload files to Dropbox by just providing a URL, without having to download the file first.
https://www.dropbox.com/developers-v1/core/docs#save-url
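
As a rough sketch of calling that endpoint with requests (this is the v1 Core API, which Dropbox has since deprecated, so treat the endpoint shape as an assumption based on the linked docs; the token and helper name are placeholders):

import requests

ACCESS_TOKEN = 'YOUR_DROPBOX_OAUTH2_TOKEN'  # hypothetical placeholder

def save_url_to_dropbox(image_url, dropbox_path):
    # Dropbox fetches the URL server-side; no local download needed.
    resp = requests.post(
        'https://api.dropbox.com/1/save_url/auto/' + dropbox_path,
        headers={'Authorization': 'Bearer ' + ACCESS_TOKEN},
        data={'url': image_url})
    resp.raise_for_status()
    return resp.json()  # returns a job handle you can poll for completion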
If you don't mind paying for the storage, you can save it to your own cloud storage. I occasionally have to do a similar action, and handle it as such:
def on_upload_image(self):
    url = self.request.get('url')
    result = urlfetch.fetch(url)
    binary = result.content
    blob_key = functions.get_blob_key_by_data(binary)
    self.url = images.get_serving_url(blob_key, secure_url=True)
    self.json()
with the helper defined in a shared module:
import hashlib

import cloudstorage as gcs
from google.appengine.api import app_identity
from google.appengine.ext import blobstore

def get_blob_key_by_data(data):
    bucket = app_identity.get_default_gcs_bucket_name()
    filename = hashlib.sha256(data).hexdigest()
    mime_type = get_mime_type(data)  # user-defined helper, not shown here
    if not mime_type:
        return None
    gcs_filename = '/%s/image_%s' % (bucket, filename)
    with gcs.open(gcs_filename, 'w', content_type=mime_type) as f:
        f.write(data)
    blob_key = blobstore.create_gs_key('/gs' + gcs_filename)
    return blob_key

Uploading an Image from an external link to google cloud storage using google app engine python

I'm looking for a solution on how to upload a picture from an external url like http://example.com/image.jpg to Google Cloud Storage using App Engine Python. I am now using
blobstore.create_upload_url('/uploadSuccess', gs_bucket_name=bucketPath)
for users that want to upload a picture from their computer, calling
images.get_serving_url(gsk, size=180, crop=True)
on uploadSuccess and storing that as their profile image. I'm trying to allow users to use their Facebook or Google profile picture after they log in with oauth2. I have access to their profile picture link, and I would just like to copy it for consistency. Please help :)
To upload an external image you have to fetch it and then save it.
To fetch the image you can use this code:
from google.appengine.api import urlfetch

file_name = 'image.jpg'
url = 'http://example.com/%s' % file_name
result = urlfetch.fetch(url)
if result.status_code == 200:
    doSomethingWithResult(result.content)
To save the image you can use the app engine GCS client code shown here
import mimetypes

import cloudstorage as gcs
from google.appengine.api import images
from google.appengine.ext import blobstore

def doSomethingWithResult(content):
    gcs_file_name = '/%s/%s' % ('bucket_name', file_name)
    content_type = mimetypes.guess_type(file_name)[0]
    with gcs.open(gcs_file_name, 'w', content_type=content_type,
                  options={b'x-goog-acl': b'public-read'}) as f:
        f.write(content)
    return images.get_serving_url(blobstore.create_gs_key('/gs' + gcs_file_name))
Here is my new solution (2019) using the google-cloud-storage library and upload_from_string() function only (see here):
import traceback
import urllib.request

from google.cloud import storage

BUCKET_NAME = "[project_name].appspot.com"  # change project_name placeholder to your preferences
BUCKET_FILE_PATH = "path/to/your/images"  # change this path

def upload_image_from_url_to_google_storage(img_url, img_name):
    """
    Uploads an image from a URL source to google storage.
    - img_url: string URL of the image, e.g. https://picsum.photos/200/200
    - img_name: string name of the image file to be stored
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(BUCKET_NAME)
    blob = bucket.blob(BUCKET_FILE_PATH + "/" + img_name + ".jpg")

    # try to read the image URL
    try:
        with urllib.request.urlopen(img_url) as response:
            # check if URL contains an image
            info = response.info()
            if info.get_content_type().startswith("image"):
                blob.upload_from_string(response.read(), content_type=info.get_content_type())
                print("Uploaded image from: " + img_url)
            else:
                print("Could not upload image. No image data type in URL")
    except Exception:
        print('Could not upload image. Generic exception: ' + traceback.format_exc())
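
Using the example URL from the docstring, a call would then look like:

# Stores the fetched image as path/to/your/images/sample.jpg in the bucket.
upload_image_from_url_to_google_storage("https://picsum.photos/200/200", "sample")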
If you're looking for an updated way of doing this that relies on the django-storages package, I wrote these two functions:
import requests
from storages.backends.gcloud import GoogleCloudStorage

def download_file(file_url, file_name):
    response = requests.get(file_url)
    if response.status_code == 200:
        upload_to_gc(response.content, file_name)

def upload_to_gc(content, file_name):
    gc_file_name = "{}/{}".format("some_container_name_here", file_name)
    with GoogleCloudStorage().open(name=gc_file_name, mode='w') as f:
        f.write(content)
Then simply call download_file() and pass the url and your preferred file_name from anywhere within your system.
The GoogleCloudStorage class comes from the django-storages package.
pip install django-storages
Django Storages
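
As a sketch, django-storages also expects the backend and bucket to be configured in settings.py (the bucket name below is a placeholder):

# settings.py
DEFAULT_FILE_STORAGE = 'storages.backends.gcloud.GoogleCloudStorage'
GS_BUCKET_NAME = 'some_container_name_here'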

Using Google App Engine how to upload document in google docs (python)

I want to upload a document or file to Google Docs using Google App Engine (Python).
Any code or link will be appreciated.
See the documentation, but you might try something like:
ms = gdata.MediaSource(file_path='/path/to/your/test.doc',
                       content_type=gdata.docs.service.SUPPORTED_FILETYPES['DOC'])
entry = gd_client.Upload(ms, 'MyDocTitle')
print 'Document now accessible online at:', entry.GetAlternateLink().href
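
For completeness, gd_client above is the old gdata v1-style DocsService; a minimal setup sketch might look like the following (note that ClientLogin authentication has long since been shut down by Google, so this is for illustration only):

import gdata.docs.service

# Deprecated ClientLogin auth; credentials are placeholders.
gd_client = gdata.docs.service.DocsService()
gd_client.ClientLogin('user@example.com', 'password')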
The solution is to upload the file's contents directly. You need to read the data in Python as shown below. First, a function to get the file size:
def getSize(self, fileobject):
    fileobject.seek(0, 2)  # move the cursor to the end of the file
    size = fileobject.tell()
    return size

f = self.request.POST.get('fname').file
media = gdata.data.MediaSource(
    file_handle=f.read(),
    content_type=gdata.docs.service.SUPPORTED_FILETYPES[ext],
    content_length=self.getSize(self.request.POST.get('fname').file))
You also need to modify Google's gdata Python library to achieve this. In client.py, inside def upload_file, replace:
while not entry:
    entry = self.upload_chunk(start_byte, self.file_handle.read(self.chunk_size))
    start_byte += self.chunk_size
with:
while not entry:
    entry = self.upload_chunk(start_byte, self.file_handle)
    start_byte += self.chunk_size
Then you can upload the file directly to Google Docs.
