Processing chm files

Processing chm files - python

Is there a Python library I can use for processing files with chm extension that has similar features as HTML parser or BeautifulSoup?

PyCHM:
http://gnochm.sourceforge.net/pychm.html

I struggled with PyCHM while creating a simple thumbnailer that extracts cover images from .chm files. Here is the code for anyone who finds this question in the future:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import chm.chm as chm
from bs4 import BeautifulSoup
from PIL import Image
import urlparse
try:
from cStringIO import StringIO
except:
from StringIO import StringIO
class CHMFile:
    """Thin wrapper around a PyCHM archive that can export a cover thumbnail."""

    # A cover candidate must be strictly larger than this many pixels on each side.
    MIN_COVER_SIDE = 50

    def __init__(self, file_name):
        """Open the CHM archive at ``file_name``.

        Raises:
            IOError: if PyCHM reports that the archive could not be loaded.
        """
        self.chmfile = chm.CHMFile()
        # PyCHM's LoadCHM is documented to return 0 on failure instead of raising.
        if not self.chmfile.LoadCHM(file_name):
            raise IOError('Cannot open CHM file: %s' % file_name)

    def _read_object(self, path):
        """Return the raw content stored at ``path`` in the archive, or None if it cannot be resolved."""
        status, unit_info = self.chmfile.ResolveObject(path)
        if status != 0:  # PyCHM documents a zero status as "object found"
            return None
        return self.chmfile.RetrieveObject(unit_info)[1]

    def create_thumb(self, out_file):
        """Save the largest image referenced by the home page to ``out_file`` as PNG.

        The cover is assumed to be the biggest image on the home page. Images
        that cannot be resolved or decoded, and images that are not larger
        than MIN_COVER_SIDE in both dimensions, are skipped. Nothing is
        written when no suitable image is found.
        """
        home_path = self.chmfile.home
        home_html = self._read_object(home_path) if home_path else None
        if not home_html:
            return

        best_image = None
        best_area = 0
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        tree = BeautifulSoup(home_html, 'html.parser')
        for img in tree.find_all('img'):
            src = img.get('src')
            if not src:
                continue  # <img> tag without a usable src attribute
            image_data = self._read_object(urlparse.urljoin(home_path, src))
            if not image_data:
                continue  # link does not resolve inside the archive
            try:
                candidate = Image.open(StringIO(image_data))
            except IOError:
                continue  # not an image format PIL can decode
            width, height = candidate.size
            area = width * height
            if (area > best_area
                    and width > self.MIN_COVER_SIDE
                    and height > self.MIN_COVER_SIDE):
                best_area = area
                best_image = candidate

        if best_image is not None:
            best_image.save(out_file, format="PNG")
if __name__ == '__main__':
    import sys

    # Expect exactly two arguments: the input .chm file and the output image path.
    if len(sys.argv) != 3:
        print('Create thumbnail image from an chm file')
        print('Usage: %s INFILE OUTFILE' % sys.argv[0])
    else:
        # Use a distinct name: rebinding ``chm`` here would shadow the imported
        # ``chm`` module that CHMFile.__init__ relies on.
        chm_file = CHMFile(sys.argv[1])
        chm_file.create_thumb(sys.argv[2])

Related

python code for downloading images from image-net.org for haar cascade training

I have Python code for downloading images from "www.image-net.org" for Haar cascade training. Basically, it goes through each image URL and downloads the image.
import urllib2
import cv2
import numpy as np
import os
import urllib
import sys
reload(sys)
sys.setdefaultencoding('utf8')
def store_raw_images():
    """Download the listed synset images into ``pos/`` as 100x100 grayscale JPEGs.

    Files are numbered from 1; the counter only advances after an image has
    been downloaded, resized and written back. Any error for a single URL is
    printed and the loop moves on to the next URL.
    """
    listing_url = 'http://www.image-net.org/api/text/imagenet.synset.geturls?wnid=n04154340'
    listing = urllib2.urlopen(listing_url).read().decode()
    url_list = listing.split('\n')

    if not os.path.exists('pos'):
        os.makedirs('pos')

    next_index = 1
    for image_url in url_list:
        try:
            print(image_url)
            target = "pos/" + str(next_index) + ".jpg"
            urllib.urlretrieve(image_url, target)
            gray = cv2.imread(target, cv2.IMREAD_GRAYSCALE)
            # should be larger than samples / pos pic (so we can place our image on it)
            cv2.imwrite(target, cv2.resize(gray, (100, 100)))
            next_index += 1
        except Exception as e:
            print(str(e))
store_raw_images()
I copy and paste the URL of the list to download into "pos_images_link", but the code only processes the URLs of 5 images and then stops running with this message in the terminal:
"terminate called after throwing an instance of 'std::out_of_range'
what(): basic_string::substr: __pos (which is 140) > this->size() (which is 0)"
I am using OpenCV 3.1.0 and Python 2.7.12.

The following worked in Python 3 with OpenCV:
from urllib.request import Request, urlretrieve
import cv2
import numpy as np
import os
import urllib
import sys
def store_raw_images():
    """Download the synset's images into ``pos/`` as 100x100 grayscale JPEGs.

    The URL list is fetched from image-net.org; each URL is downloaded,
    loaded as grayscale, resized to 100x100 and written back in place. Files
    are numbered consecutively from 1, counting only images that were
    processed successfully. A failure for a single URL is printed and skipped.
    """
    url = 'http://www.image-net.org/api/text/imagenet.synset.geturls?wnid=n04154340'
    request = urllib.request.Request(url)
    with urllib.request.urlopen(request) as response:
        urls = response.read().decode('utf-8')

    os.makedirs('pos', exist_ok=True)

    pic_num = 1
    # splitlines() copes with "\r\n" line endings; blank lines are not URLs.
    for line in urls.splitlines():
        image_url = line.strip()
        if not image_url:
            continue
        path = "pos/" + str(pic_num) + ".jpg"
        try:
            print(image_url)
            urlretrieve(image_url, path)
            img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread returns None instead of raising when the file is
                # not a decodable image (e.g. an HTML error page).
                raise ValueError('not a readable image: ' + image_url)
            # should be larger than samples / pos pic (so we can place our image on it)
            resized_image = cv2.resize(img, (100, 100))
            cv2.imwrite(path, resized_image)
            pic_num += 1
        except Exception as e:
            print(str(e))
            # Do not leave a partial or non-image download behind under this number.
            if os.path.exists(path):
                os.remove(path)
store_raw_images()

How to convert image format between PIL opencv web and mongoDB?

I want to store an image from the web in MongoDB, but first I check the image with OpenCV to make sure it is a blood test report image, as in the following snippet:
if 'imagefile' not in request.files:
abort(400)
imgfile = request.files['imagefile']
if imgfile.filename == '':
abort(400)
if imgfile:
#pil = StringIO(imgfile)
#pil = Image.open(pil)
img = cv2.imdecode(numpy.fromstring(imgfile.read(), numpy.uint8), cv2.CV_LOAD_IMAGE_UNCHANGED)
filtered = ImageFilter(image=img).filter()
if filtered is None:
return jsonify({"error": "please make sure your picture is perfect"})
# save to mongo
content = StringIO()
filtered.save(content, format="JPEG")
fid, filename= save_file(content,imgfile.name)
The ImageFilter accepts an OpenCV-format image, does some processing such as filtering, and then returns a PIL image — and that succeeds! Then I save the PIL image to MongoDB with code like this:
def save_file(content, name):
    """Check the image held in ``content`` and store it in ``db.files``.

    The request is aborted with HTTP 400 when PIL cannot open the data or
    when the detected format is not listed in ``ALLOWED_EXTENSIONS``.

    Returns:
        A ``(_id, filename)`` tuple taken from the stored document.
    """
    try:
        detected = Image.open(content).format.lower()
    except IOError:
        abort(400)
    if detected not in app.config['ALLOWED_EXTENSIONS']:
        abort(400)

    record = {
        'content': bson.binary.Binary(content.getvalue()),
        'filename': secure_filename(name),
        'mime': detected,
    }
    db.files.save(record)
    return record['_id'], record['filename']
And that succeeds too! Then I have another function that finds an image by id in MongoDB, which I then use for OCR.
# Look up a stored image by id, decode it with OpenCV, run OCR on it and
# return the result as JSON. A missing document is converted into InvalidId
# so that it shares the 404 handling at the bottom.
def get_report(fid):
try:
file = db.files.find_one(bson.objectid.ObjectId(fid))
if file is None:
raise bson.errors.InvalidId()
print(type(file['content']))
# NOTE(review): if dumps() is bson.json_util.dumps (as imported in the
# standalone script further down), it yields a JSON string rather than the
# stored image bytes, which would explain imdecode returning None. Passing
# the raw bytes of file['content'] to numpy is presumably what was intended
# - confirm.
img = cv2.imdecode(numpy.fromstring(dumps(file['content']), numpy.uint8), cv2.CV_LOAD_IMAGE_UNCHANGED)
if img is None:
return jsonify({"error": "please make sure your picture is perfect"})
# 22 is passed straight through to ImageFilter.ocr; its meaning is not visible here.
report_data = ImageFilter(image=img).ocr(22)
print report_data
if report_data is None:
return jsonify({"error": "can't ocr'"})
return jsonify(report_data)
except bson.errors.InvalidId:
flask.abort(404)
Again, I need it in OpenCV format, so I convert the bson.binary.Binary to an OpenCV image, but this fails, because img is always None after
img = cv2.imdecode(numpy.fromstring(dumps(file['content']), numpy.uint8), cv2.CV_LOAD_IMAGE_UNCHANGED)
So my final question is: what is the real image format in Python, and how do I convert an image between the web, MongoDB, OpenCV, PIL and memory? The following is one method I tried, but it failed. I wanted to first convert the Binary to a PIL image using Image.frombytes and then convert the PIL image to OpenCV, but I get the error: ValueError: not enough image data
# -*- coding: utf-8 -*-
import os
from pymongo import MongoClient
import bson
from PIL import Image
from imageFilter import ImageFilter
import cv2
import numpy
from bson.json_util import dumps
# Standalone reproduction: fetch one stored image document from MongoDB,
# try to turn its content into PIL / OpenCV images and run OCR on the result.
db = MongoClient('localhost', 27017).test
file =db.files.find_one(bson.objectid.ObjectId("58454666a235ec451d3bf2e6"))
if file is None:
raise bson.errors.InvalidId()
print(type(file['content']))
# Serving the stored content directly through a Flask Response works:
#return Response(file['content'], mimetype='image/' + file['mime'])
# file['content'] is the binary content of the whole image file (i.e. an
# encoded file), so it should not be passed to Image directly as raw pixel data.
Image.frombytes(mode='RGB',size=(1000,760),data=file['content'])
# NOTE(review): dumps() is bson.json_util.dumps here, so numpy receives a
# JSON string instead of the stored bytes - presumably why img ends up None; confirm.
img = cv2.imdecode(numpy.fromstring(dumps(file['content']), numpy.uint8), cv2.CV_LOAD_IMAGE_UNCHANGED)
if img is None:
print "img is None"
# ImageFilter accepts an OpenCV image and processes it with OpenCV.
# NOTE(review): execution continues to this call even when img is None.
report_data = ImageFilter(image=img).ocr(22)
print report_data

Using def function to call image from urllib import urlopen - python 2.7

I have written a function that loads a JPG image so it can be displayed easily, but it is not working properly. Also, I don't want to use urllib2. It says image_file = image_to_PhotoImage(image)
NameError: name 'image' is not defined
Any suggestions? Thank you
from urllib import urlopen
from Tkinter import *
def image_to_PhotoImage(image, width=None, height=None):
    """Convert the raw content of an image file into a Tkinter PhotoImage.

    Args:
        image: the encoded image data (e.g. the bytes read from a download).
        width: optional target width in pixels.
        height: optional target height in pixels.
            The image is resized only when both are positive integers.

    Returns:
        An ``ImageTk.PhotoImage`` built from the (possibly resized) image.

    Raises:
        Exception: if PIL cannot be imported or the data cannot be opened
            as an image.
    """
    # Import the Python Imaging Library, if it exists
    try:
        from PIL import Image, ImageTk
    except ImportError:
        raise Exception('Python Imaging Library has not been installed properly!')

    # StringIO wraps the raw data in the file-like object Image.open expects
    from StringIO import StringIO
    image_chars = StringIO(image)

    # Open the data as a PIL image, if possible. Exception (not a bare
    # except) keeps Ctrl-C and interpreter exit from being masked.
    try:
        pil_image = Image.open(image_chars)
    except Exception:
        raise Exception('Cannot recognise image given to "image_to_Photoimage" function\n' +
                        'Confirm that image was downloaded correctly')

    # Resize the image, if a new size has been provided
    if (isinstance(width, int) and isinstance(height, int)
            and width > 0 and height > 0):
        pil_image = pil_image.resize((width, height), Image.ANTIALIAS)

    # Return the result as a Tkinter PhotoImage
    return ImageTk.PhotoImage(pil_image)
import Tkinter as tk
# Download an image over HTTP and wrap it in a Tk label.
root = tk.Tk()
url = "http://www.online-image-editor.com//styles/2014/images/example_image.png"
gogo = urlopen(url)
data_stream = gogo.read()
gogo.close()
# NOTE(review): `image1` is never assigned anywhere in this snippet; the
# downloaded bytes are held in `data_stream`, which is presumably what should
# be passed here - the likely cause of the reported NameError.
image_file = image_to_PhotoImage(image1)
# NOTE(review): no pack()/grid() call on the label and no root.mainloop() are
# visible in this snippet, so nothing would be displayed by these lines alone.
label = tk.Label(root, image=image_file, bg='black')

Windows Python Code not working on Linux Debian

The code generates a QR code and prints it, but it does not work on Debian because the imported libraries (win32print, win32ui) are not supported there.
Can anyone tell me how to run it on Debian without changing the whole code?
from random import randint
import win32print
import win32ui
from PIL import Image, ImageWin
from PIL._imaging import font
from PIL import ImageFont
from PIL import ImageDraw
HORZRES = 8
VERTRES = 10
LOGPIXELSX = 88
LOGPIXELSY = 90
PHYSICALWIDTH = 110
PHYSICALHEIGHT = 111
PHYSICALOFFSETX = 112
PHYSICALOFFSETY = 113
__author__ = 'masoodhussain'
import qrcode
import subprocess
import os
# Build a QR code, save it as a PNG with a random numeric suffix, and print
# that PNG on the default Windows printer through a win32ui device context.
qr = qrcode.QRCode(
version=1,
error_correction=qrcode.constants.ERROR_CORRECT_L,
box_size=10,
border=4,
)
qr.add_data('Masooddkjfdlfs,kokdfds sddshfhkjshfljsdhkjfdrtyyhtfhfghgh3')
qr.make(fit=True)
# The next line is a bare string literal, so it has no effect at runtime.
"subprocess.call(['lp', 'foo.png'])"
printer_name = win32print.GetDefaultPrinter()
img = qr.make_image()
img.show()
random_number= randint(0,10000)
img.save('label_'+str(random_number)+'.png')
file_name = 'label_'+str(random_number)+'.png'
print(file_name)
# Open a device context on the printer and query its geometry using the
# GetDeviceCaps index constants defined above.
hDC = win32ui.CreateDC ()
hDC.CreatePrinterDC (printer_name)
printable_area = hDC.GetDeviceCaps (HORZRES), hDC.GetDeviceCaps (VERTRES)
printer_size = hDC.GetDeviceCaps (PHYSICALWIDTH), hDC.GetDeviceCaps (PHYSICALHEIGHT)
# printer_margins is computed but not used below.
printer_margins = hDC.GetDeviceCaps (PHYSICALOFFSETX), hDC.GetDeviceCaps (PHYSICALOFFSETY)
# Reload the saved label; rotate it when it is wider than it is tall.
bmp = Image.open (file_name)
if bmp.size[0] > bmp.size[1]:
bmp = bmp.rotate (90)
# Use the smaller of the two ratios so the whole image fits the printable area.
ratios = [1.0 * printable_area[0] / bmp.size[0], 1.0 * printable_area[1] / bmp.size[1]]
scale = min (ratios)
hDC.StartDoc (file_name)
hDC.StartPage ()
dib = ImageWin.Dib (bmp)
scaled_width, scaled_height = [int (scale * i) for i in bmp.size]
# Centre the scaled image within printer_size.
x1 = int ((printer_size[0] - scaled_width) / 2)
y1 = int ((printer_size[1] - scaled_height) / 2)
x2 = x1 + scaled_width
y2 = y1 + scaled_height
dib.draw (hDC.GetHandleOutput (), (x1, y1, x2, y2))
# Finish the page and the document, then release the device context.
hDC.EndPage ()
hDC.EndDoc ()
hDC.DeleteDC ()
When I run the code after removing the unsupported libraries, it fails with an import error on this line:
import qrcode
I am trying to import the whole package so that I can use its other files. On Windows it was working perfectly. Any help would be appreciated. Thanks.

This code is equivalent to the code posted in Question.
from random import randint
import cups
from PIL import Image, ImageWin
from PIL._imaging import font
from PIL import ImageFont
from PIL import ImageDraw
__author__ = 'masoodhussain'
import qrcode
# Build a QR code, save it as a PNG with a random numeric suffix, and send
# that PNG to a CUPS printer.
qr = qrcode.QRCode(
    version=1,
    error_correction=qrcode.constants.ERROR_CORRECT_L,
    box_size=5,
    border=2,
)
qr.add_data('localhost:5070productinfo')
qr.make(fit=True)

conn = cups.Connection()
printers = conn.getPrinters()
if not printers:
    raise SystemExit('No CUPS printer is configured')
# Take the first printer CUPS lists. next(iter(...)) also works on Python 3,
# where dict.keys() cannot be indexed.
printer_name = next(iter(printers))
printqueuelength = len(conn.getJobs())

img = qr.make_image()
img.show()

# Build the file name once so the saved file and the printed file cannot diverge.
random_number = randint(0, 10000)
file_name = 'label_' + str(random_number) + '.png'
img.save(file_name)
print(file_name)

conn.printFile(printer_name, file_name, "Hello", options={'media': '25x25mm'})
The important part is installing the required libraries and changing the media option to the required size.

Even if you install qrcode, your code will still fail because of the Windows specific library. You need to check on which system you are working and preferably put the whole print function in a separate function.
Here are some useful links: https://stackoverflow.com/a/1857/2776376 and https://pypi.python.org/pypi/pycups
import platform
import sys

# Import only the printing backend that exists on the current operating system.
if platform.system() == 'Linux':
    import cups  # module provided by the pycups package
elif platform.system() == 'Windows':
    import win32print
    import win32ui
else:
    print('Unsupported OS. Exiting....')
    sys.exit(1)


def my_printer_function():
    """Dispatch to the platform-specific printing code (skeleton to fill in)."""
    if platform.system() == 'Linux':
        # now call the Linux printer
        pass
    elif platform.system() == 'Windows':
        # use your old Windows code
        pass

img = Image.open(fp) AttributeError: class Image has no attribute 'open'

I want to put the pictures into a PDF file. My code follows...
import sys
import xlrd
from PIL import Image
import ImageEnhance
from reportlab.platypus import *
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.rl_config import defaultPageSize
PAGE_HEIGHT=defaultPageSize[1]
styles = getSampleStyleSheet()
Title = "Integrating Diverse Data Sources with Gadfly 2"
Author = "Aaron Watters"
URL = "http://www.chordate.com/"
email = "arw#ifu.net"
from reportlab.lib.units import inch
pageinfo = "%s / %s / %s" % (Author, email, Title)
def myFirstPage(canvas, doc):
    """Draw the document title and the first-page footer line on ``canvas``."""
    canvas.saveState()
    title_y = PAGE_HEIGHT - 108
    canvas.setFont('Times-Bold', 16)
    canvas.drawString(108, title_y, Title)
    footer_text = "First Page / %s" % pageinfo
    canvas.setFont('Times-Roman', 9)
    canvas.drawString(inch, 0.75 * inch, footer_text)
    canvas.restoreState()
def myLaterPages(canvas, doc):
    """Draw the footer line with the page number on every page after the first."""
    canvas.saveState()
    canvas.setFont('Times-Roman', 9)
    footer_text = "Page %d %s" % (doc.page, pageinfo)
    canvas.drawString(inch, 0.75 * inch, footer_text)
    canvas.restoreState()
def go():
    """Prepend a spacer to ``Elements`` and build ``ss.pdf`` from it."""
    top_gap = Spacer(0, inch)
    Elements.insert(0, top_gap)
    SimpleDocTemplate('ss.pdf').build(
        Elements,
        onFirstPage=myFirstPage,
        onLaterPages=myLaterPages,
    )
Elements = []
HeaderStyle = styles["Heading1"] # XXXX
def header(txt, style=HeaderStyle, klass=Paragraph, sep=0.3):
    """Append a spacer ``sep`` inches high to ``Elements``, then ``klass(txt, style)``."""
    Elements.append(Spacer(0.2 * inch, sep * inch))
    Elements.append(klass(txt, style))
ParaStyle = styles["Normal"]
def p(txt):
    """Add ``txt`` via ``header`` using ``ParaStyle`` and a 0.1 inch separator."""
    paragraph_options = {'style': ParaStyle, 'sep': 0.1}
    return header(txt, **paragraph_options)
def open_excel(file='exc.xls'):
    """Open the Excel workbook at ``file`` with xlrd.

    Returns:
        The workbook returned by ``xlrd.open_workbook``, or None when opening
        fails for any reason (the error message is printed, not raised).
    """
    try:
        return xlrd.open_workbook(file)
    except Exception as e:  # best-effort: report the problem and carry on
        print(str(e))
        return None
#pre = p # XXX
PreStyle = styles["Code"]
def pre(txt):
    """Append a 0.1 inch spacer to ``Elements``, then ``txt`` as a Preformatted block in ``PreStyle``."""
    gap = Spacer(0.1 * inch, 0.1 * inch)
    Elements.append(gap)
    block = Preformatted(txt, PreStyle)
    Elements.append(block)
# Script body: add one paragraph, try to open and show a PNG, then build the PDF.
p("""\
Relational databases manipulate and store persistent
table structures called relations, such as the following
three tables""")
# NOTE(review): `fp` is never closed in this snippet.
fp = open("/pdf-ex/downloadwin7.png","rb")
# NOTE(review): this snippet runs `from reportlab.platypus import *` after
# `from PIL import Image`, so `Image` here is presumably reportlab's Image
# class rather than the PIL module - consistent with the reported
# AttributeError; confirm.
img = Image.open(fp)
img.show()
# HACK
Elements.append(PageBreak())
go()

You have a namespace conflict. One of your import statements is masking PIL.Image (which is a module, not a class) with some class named Image.
Instead of ...
from PIL import Image
try ...
import PIL.Image
then later in your code...
fp = open("/pdf-ex/downloadwin7.png","rb")
img = PIL.Image.open(fp)
img.show()
When working with a LOT of imports, beware of namespace conflicts. I'm generally very wary of from some_module import * statements.
Good luck with your project and happy coding.

I had a similar problem with TKInter in a single file:
I changed:
from PIL import ImageTk, Image
from tkinter import *
to:
from tkinter import *
from PIL import ImageTk, Image
and the problem went away.

This is the only solution I could find.
try:
from PIL import Image
except ImportError:
import Image

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Processing chm files - python

Is there a Python library I can use for processing files with chm extension that has similar features as HTML parser or BeautifulSoup?

PyCHM: http://gnochm.sourceforge.net/pychm.html

Related

python code for downloading images from image-net.org for haar cascade training

How to convert image format between PIL opencv web and mongoDB?

Using def function to call image from urllib import urlopen - python 2.7

Windows Python Code not working on Linux Debian

img = Image.open(fp) AttributeError: class Image has no attribute 'open'

Categories

Resources