Python null check import xls

Python null check import xls - python

I have a Python script that reads fields from an .xls file.
The script works, but there are problems when the file contains empty fields:
a row is not read at all if one of its fields is empty.
My code is below; in my example the NORD field is empty:
from msexcel8com import *
def convert(dsIn, dsOut):
import sys
sys.setdefaultencoding("utf-8")
import msexcel8com
xlsApp = msexcel8com.Application()
xlsApp.Workbooks.Open(unicode(dsIn["PATH_TO_XLS"]))
xlsWorkbook = xlsApp.Workbooks.Item(1)
xlsWorksheet = xlsWorkbook.Worksheets.Item(1)
xlsWorksheet.Cells.SpecialCells(11, None).Activate()
rowsCount = xlsApp.ActiveCell.Row
import msxml2
dsOut.clear()
outXML = msxml2.DOMDocument()
RootNode = outXML.createElement("MSG")
RootNode.setAttribute("FORMAT", "IMPORT_LN")
ChildNodes = outXML.appendChild(RootNode)
i, k, c = 1, 1, 2
while i < rowsCount:
i = i + 1
if k > c:
k = 0
dsOut.append()
dsOut["XML_OUT"] = unicode.encode(outXML.xml, "utf-8")
outXML = msxml2.DOMDocument()
RootNode = outXML.createElement("MSG")
RootNode.setAttribute("FORMAT", "IMPORT_LN")
ChildNodes = outXML.appendChild(RootNode)
try:
TMPNode = outXML.createElement("CLIENT")
TMPNode.setAttribute("NCODE", xlsWorksheet.Cells.Item(i, 1).Value)
TMPNode.setAttribute("NORD", xlsWorksheet.Cells.Item(i, 2).Value)
ChildNodes.appendChild(TMPNode)
k = k + 1
except Exception as e:
print(e)
dsOut.append()
dsOut["XML_OUT"] = unicode.encode(outXML.xml, "utf-8")
try:
xlsApp.Workbooks.Close()
except Exception as e:
print(e)
try:
xlsApp.Quit()
except Exception as e:
print(e)
How can I make sure that, even if a field is empty, it is returned as null together with the rest of the values?

I couldn't resist the temptation of writing this without all that Excel automation.
Assuming an Excel file that looks something like this called so59715137.xls:
import xlrd # assuming it's an .xls, not .xlsx
import xml.etree.ElementTree as et
def read_rows(xls_filename, column_labels):
    """Yield one dict per row of the first worksheet in *xls_filename*.

    Each dict pairs the entries of *column_labels* with the row's cell
    values in column order; pairing stops at the shorter of the two.
    """
    first_sheet = xlrd.open_workbook(xls_filename).sheet_by_index(0)
    row_index = 0
    while row_index < first_sheet.nrows:
        cells = first_sheet.row_values(row_index)
        yield {label: value for label, value in zip(column_labels, cells)}
        row_index += 1
def convert(xls_filename):
    """Build the ``IMPORT_LN`` XML message from the first sheet of an .xls file.

    Every row that has a truthy value in both the NCODE and NORD columns
    becomes a ``<CLIENT>`` element; rows missing either value are skipped.

    Returns the serialized XML document as a text string.
    """
    xml_root = et.Element("MSG", {"FORMAT": "IMPORT_LN"})
    for row in read_rows(xls_filename, ("NCODE", "NORD")):
        print(row)  # for debugging
        if row.get("NCODE") and row.get("NORD"):  # both attributes must be truthy
            # Numeric cells are delivered as floats, but ElementTree can only
            # serialize string attribute values, so convert before attaching.
            attributes = {name: str(value) for name, value in row.items()}
            et.SubElement(xml_root, "CLIENT", attrib=attributes)
    return et.tostring(xml_root, encoding="unicode")
xml_content = convert("so59715137.xls")
print("-------------------------------")
print(xml_content)
# TODO: write to file
outputs (with debugging output included, so you see it reads but elides the rows that are missing data)
{'NCODE': 'foo', 'NORD': 'faa'}
{'NCODE': 'blep', 'NORD': 'blop'}
{'NCODE': 'missing', 'NORD': ''}
{'NCODE': '', 'NORD': 'other-missing'}
-------------------------------
<MSG FORMAT="IMPORT_LN"><CLIENT NCODE="foo" NORD="faa" /><CLIENT NCODE="blep" NORD="blop" /></MSG>
From there on out, it's easy to read/write your dsIn/dsOut structures.

Related

Getting Import/Library issue in my robotframework

I am getting import/library issues in Robot Framework. I have a Customlib file where all my custom functions reside, and I get an error when I try to import Customlib.
[enter image description here][1]
[enter image description here][2]
[1]: https://i.stack.imgur.com/poPzQ.png
[2]: https://i.stack.imgur.com/qkbxK.png
CustomLib Code:
robot is complaining Setup failed: No keyword with name Customlib.get config test data
import os
# Declaring empty list for test data and config file
testData = {}
configTestData = {}
class CustomLib:
    """Robot Framework keyword library for test data and dynamic locators.

    The reader keywords fill the module-level ``testData`` and
    ``configTestData`` dictionaries and return them.
    """

    ROBOT_LIBRARY_SCOPE = 'Test Case'

    @staticmethod
    def _config_path():
        """Print this module's directory and return the config file path."""
        configpath = os.path.dirname(os.path.abspath(__file__))
        print(configpath)
        configpath1 = configpath.replace("Utils", "")
        # NOTE(review): replacing the whole string with itself always yields
        # the bare name "config.properties", i.e. a path relative to the
        # current working directory. Kept as-is; confirm this is intended.
        return configpath1.replace(configpath1, "config.properties")

    @staticmethod
    def _load_properties(path, target):
        """Parse ``key=value`` lines from *path* into the dict *target*.

        Only the first ``=`` separates key from value, so values may contain
        ``=``. Lines without ``=`` are skipped. Errors are printed, not
        raised, and leave *target* with whatever was read so far.
        """
        try:
            # The context manager closes the file even when parsing fails and
            # avoids touching an unbound handle when open() itself fails.
            with open(path) as handle:
                for line in handle:
                    key, separator, value = line.partition("=")
                    if not separator:
                        continue
                    target[key.rstrip('\n')] = value.rstrip('\n')
        except Exception as e:
            print(e)

    # Function for getting data from config file and test data file altogether
    @staticmethod
    def get_global_config_data_and_test_data(testdata_filename):
        """Load the config file and *testdata_filename* into ``testData``.

        *testdata_filename* uses ``$`` as path separator; it is converted
        with ``OS_path_fromat_separator`` before being opened.
        Returns the ``testData`` dictionary.
        """
        CustomLib._load_properties(CustomLib._config_path(), testData)
        return CustomLib.get_testData_From_PropertiesFile(
            CustomLib.OS_path_fromat_separator(testdata_filename))

    # Function for reading test data from property file
    @staticmethod
    def get_testData_From_PropertiesFile(propfile):
        """Load the properties file *propfile* into ``testData`` and return it."""
        CustomLib._load_properties(propfile, testData)
        return testData

    # Function for generating dynamic path which has text in XPATH
    def generate_dynamic_xpath(self, locatorvalue, replacement):
        """Return *locatorvalue* with every ``#`` replaced by *replacement*."""
        return locatorvalue.replace('#', replacement)

    # Function for creating report name
    def create_report_name(self, testContent, date):
        """Return *testContent* with every ``date`` replaced by *date*."""
        return testContent.replace('date', date)

    # Function for reading config.properties file
    @staticmethod
    def get_config_testdata():
        """Load the config file into ``configTestData`` and return it."""
        configpath = CustomLib._config_path()
        print(configpath)
        CustomLib._load_properties(configpath, configTestData)
        return configTestData

    # Function to format the path for different OS
    @staticmethod
    def OS_path_fromat_separator(pathformat):
        """Replace the ``$`` placeholders in *pathformat* with a path separator.

        Uses ``//`` when the config file's ``OS`` entry is ``Windows`` and
        ``/`` otherwise. Raises ``KeyError`` if the config has no ``OS`` entry.
        """
        config_data = CustomLib.get_config_testdata()
        if config_data['OS'] == 'Windows':
            return pathformat.replace('$', '//')
        return pathformat.replace('$', '/')

    # Function for generating dynamic CSS on the basis of text
    def generate_dynamic_CSS(self, locatorvalue, replacement):
        """Return *locatorvalue* with every ``#`` replaced by *replacement*."""
        return locatorvalue.replace('#', replacement)
def main():
CustomLib.get_global_config_data()
# CustomLib.get_testData_From_PropertiesFile()
if __name__ == '__main__':
main()

I'm using relative path to import custom library. Below is the reference, how to use import using relative path. So this should work.
Library ..${/}foldername${/}customlibrary.py
And you can also use full path and import the library

Method cannot access class variable of different class

I am writing an algorithm in Python that is supposed to sort children (out of a database table) into one of their chosen kindergarten wishes (also out of a database table) following certain criteria on who to guarantee a place in their chosen kindergarten first. For this I first wrote a KitaDAO class to link the programme to the database and fetch information out of certain tables, saving them as an object.
import pymysql
import json
from Kita import Kita
from Kind import Kind
from Element import Element
class KitaDAO():
    """Data-access object that loads Kita, Kind and waiting-list rows from MySQL."""

    def __init__(self):
        """Open the database connection and create the shared cursor."""
        # Keyword arguments: PyMySQL 1.0+ no longer accepts positional
        # connection parameters.
        self.db = pymysql.connect(host="localhost", user="projekt",
                                  password="projekt", database="kita")
        self.cursor = self.db.cursor()
        self.kitaList = []
        self.kinderList = []

    def _fetch_objects(self, sql, factory, column_count):
        """Run *sql* and build one *factory* object per row.

        The first *column_count* columns of each row are passed to *factory*
        positionally. Errors are printed; the objects built before the error
        are still returned.
        """
        objects = []
        self.sql = sql
        try:
            self.cursor.execute(self.sql)
            self.results = self.cursor.fetchall()
            for row in self.results:
                objects.append(factory(*row[:column_count]))
        except Exception as e:
            print(e)
        return objects

    def getKitas(self):
        """Return a list with one Kita object per row of table ``kitas``."""
        # Rebuild the list on every call so repeated calls do not accumulate
        # duplicate entries.
        self.kitaList = self._fetch_objects("SELECT * FROM kitas", Kita, 9)
        return self.kitaList

    def getWarteliste(self):
        """Return a list with one Element object per row of table ``warteliste``."""
        self.warteliste = self._fetch_objects("SELECT * FROM warteliste", Element, 7)
        return self.warteliste

    def getKinder(self):
        """Return a list with one Kind object per row of table ``kinderprofil``."""
        # Rebuild the list on every call so repeated calls do not accumulate
        # duplicate entries.
        self.kinderList = self._fetch_objects("SELECT * FROM kinderprofil", Kind, 7)
        return self.kinderList

    def getKindOnWarteliste(self, kita, wunschnummer):
        """Return the children whose wish number *wunschnummer* equals *kita*.

        *wunschnummer* must be 1, 2 or 3; any other value prints an error and
        yields an empty list. The result holds what each matching waiting-list
        element returns from ``getKind()``.
        """
        self.kinderList = []
        self.warteliste = self.getWarteliste()
        wish_getters = ((1, "getWunsch1"), (2, "getWunsch2"), (3, "getWunsch3"))
        for nummer, getter_name in wish_getters:
            if wunschnummer == nummer:
                for eintrag in self.warteliste:
                    if kita == getattr(eintrag, getter_name)():
                        self.kinderList.append(eintrag.getKind())
                break
        else:
            print("Error: Eine ungültige Wunschnummer wurde übergeben.")
        return self.kinderList
If needed I can also post the classes Element, Kind and Kita in here but they basically only contain an __init__ method and if needed a get method. They also work, I have tested that before.
My problem is now, that in my main class called Sortierung I made thisDAO an instance of KitaDAO and want to use it to call methods and such, as normally. Sadly the class variable thisDAO is not accessible in a method of Sortierung. So basically this code has the response:
File "Sortierung.py", line 3, in <module> class Sortierung():
File "Sortierung.py", line 30, in Sortierung checkBetreuung(i,warteliste)
File "Sortierung.py", line 11, in checkBetreuung KinderObjektListe = thisDAO.getKinder()
nameError: name 'thisDAO' is not defined
I marked the lines in the code under here.
from KitaDAO import KitaDAO
class Sortierung(): #---------- This is line 3
kitas = []
thisDAO = KitaDAO()
kitas = thisDAO.getKitas()
def checkBetreuung(kita,kinderIDListe):
KinderObjektListe = []
KinderObjektListe = thisDAO.getKinder() #---------This is line 11
#left something out here that was irrelevant
for x in range(1,4):
for i in kitas:
warteliste = []
warteliste = thisDAO.getKindOnWarteliste(i.getID,x)
checkBetreuung(i,warteliste) #-------------This is line 30
Also BTW I am German that is why the variable names are all in German. Sorry :)

You don't need the Sortierung class at all (this is not Java; not everything needs to be encapsulated in a class) – the root problem is thisDAO ends up being a class attribute of it.
Something like
from KitaDAO import KitaDAO
thisDAO = KitaDAO()
kitas = thisDAO.getKitas()
def checkBetreuung(kita, kinderIDListe):
KinderObjektListe = thisDAO.getKinder()
for x in range(1,4):
for i in kitas:
warteliste = thisDAO.getKindOnWarteliste(i.getID(), x)
checkBetreuung(i, warteliste)
should do the trick, barring any other problems.

How to get the Worksheet ID from a Google Spreadsheet with python?

I'd like to identify a method to attain the Worksheet ID within the URL for each of the worksheets within a Google Spreadsheet Workbook. For example, the worksheet id for 'sheet2' of this workbook is '1244369280' , since it's url is https://docs.google.com/spreadsheets/d/1yd8qTYjRns4_OT8PbsZzH0zajvzguKS79dq6j--hnTs/edit#gid=1244369280
One method I've found is to pull the XML of a Google Spreadsheet, since according to this question, the only way to get the Worksheet ID is to stream down the XML of a worksheet, but the example there is written in VB.NET and I need to do this in Python
This is the VB.NET code that I'd like to replicate in Python:
Dim worksheetFeed As WorksheetFeed
Dim query As WorksheetQuery
Dim worksheet As WorksheetEntry
Dim output As New MemoryStream
Dim xml As String
Dim gid As String = String.Empty
Try
_service = New Spreadsheets.SpreadsheetsService("ServiceName")
_service.setUserCredentials(UserId, Password)
query = New WorksheetQuery(feedUrl)
worksheetFeed = _service.Query(query)
worksheet = worksheetFeed.Entries(0)
' Save worksheet feed to memory stream so we can
' get the xml returned from the feed url and look for
' the gid. Gid allows us to download the specific worksheet tab
Using output
worksheet.SaveToXml(output)
End Using
xml = Encoding.ASCII.GetString(output.ToArray())
It seems that the best way to get the XML from a Google Spreadsheet is using Gdata, so I've downloaded GData and tried the Google Spreadsheet example with my credentials.
See below
#!/usr/bin/python
#
# Copyright (C) 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__author__ = 'api.laurabeth#gmail.com (Laura Beth Lincoln)'
try:
from xml.etree import ElementTree
except ImportError:
from elementtree import ElementTree
import gdata.spreadsheet.service
import gdata.service
import atom.service
import gdata.spreadsheet
import atom
import getopt
import sys
import string
class SimpleCRUD:
def __init__(self, email, password):
    """Create the Spreadsheets client and log in with the given account.

    Args:
        email: account name used for the login.
        password: password for that account.
    """
    self.gd_client = gdata.spreadsheet.service.SpreadsheetsService()
    # Use the credentials supplied by the caller; account details must never
    # be hard-coded in the source.
    self.gd_client.email = email
    self.gd_client.password = password
    self.gd_client.source = 'Spreadsheets GData Sample'
    self.gd_client.ProgrammaticLogin()
    self.curr_key = ''
    self.curr_wksht_id = ''
    self.list_feed = None
def _PromptForSpreadsheet(self):
# Get the list of spreadsheets
feed = self.gd_client.GetSpreadsheetsFeed()
self._PrintFeed(feed)
input = raw_input('\nSelection: ')
id_parts = feed.entry[string.atoi(input)].id.text.split('/')
self.curr_key = id_parts[len(id_parts) - 1]
def _PromptForWorksheet(self):
# Get the list of worksheets
feed = self.gd_client.GetWorksheetsFeed(self.curr_key)
self._PrintFeed(feed)
input = raw_input('\nSelection: ')
id_parts = feed.entry[string.atoi(input)].id.text.split('/')
self.curr_wksht_id = id_parts[len(id_parts) - 1]
def _PromptForCellsAction(self):
print ('dump\n'
'update {row} {col} {input_value}\n'
'\n')
input = raw_input('Command: ')
command = input.split(' ', 1)
if command[0] == 'dump':
self._CellsGetAction()
elif command[0] == 'update':
parsed = command[1].split(' ', 2)
if len(parsed) == 3:
self._CellsUpdateAction(parsed[0], parsed[1], parsed[2])
else:
self._CellsUpdateAction(parsed[0], parsed[1], '')
else:
self._InvalidCommandError(input)
def _PromptForListAction(self):
print ('dump\n'
'insert {row_data} (example: insert label=content)\n'
'update {row_index} {row_data}\n'
'delete {row_index}\n'
'Note: No uppercase letters in column names!\n'
'\n')
input = raw_input('Command: ')
command = input.split(' ' , 1)
if command[0] == 'dump':
self._ListGetAction()
elif command[0] == 'insert':
self._ListInsertAction(command[1])
elif command[0] == 'update':
parsed = command[1].split(' ', 1)
self._ListUpdateAction(parsed[0], parsed[1])
elif command[0] == 'delete':
self._ListDeleteAction(command[1])
else:
self._InvalidCommandError(input)
def _CellsGetAction(self):
# Get the feed of cells
feed = self.gd_client.GetCellsFeed(self.curr_key, self.curr_wksht_id)
self._PrintFeed(feed)
def _CellsUpdateAction(self, row, col, inputValue):
entry = self.gd_client.UpdateCell(row=row, col=col, inputValue=inputValue,
key=self.curr_key, wksht_id=self.curr_wksht_id)
if isinstance(entry, gdata.spreadsheet.SpreadsheetsCell):
print 'Updated!'
def _ListGetAction(self):
# Get the list feed
self.list_feed = self.gd_client.GetListFeed(self.curr_key, self.curr_wksht_id)
self._PrintFeed(self.list_feed)
def _ListInsertAction(self, row_data):
entry = self.gd_client.InsertRow(self._StringToDictionary(row_data),
self.curr_key, self.curr_wksht_id)
if isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
print 'Inserted!'
def _ListUpdateAction(self, index, row_data):
self.list_feed = self.gd_client.GetListFeed(self.curr_key, self.curr_wksht_id)
entry = self.gd_client.UpdateRow(
self.list_feed.entry[string.atoi(index)],
self._StringToDictionary(row_data))
if isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
print 'Updated!'
def _ListDeleteAction(self, index):
self.list_feed = self.gd_client.GetListFeed(self.curr_key, self.curr_wksht_id)
self.gd_client.DeleteRow(self.list_feed.entry[string.atoi(index)])
print 'Deleted!'
def _StringToDictionary(self, row_data):
dict = {}
for param in row_data.split():
temp = param.split('=')
dict[temp[0]] = temp[1]
return dict
def _PrintFeed(self, feed):
for i, entry in enumerate(feed.entry):
if isinstance(feed, gdata.spreadsheet.SpreadsheetsCellsFeed):
print '%s %s\n' % (entry.title.text, entry.content.text)
elif isinstance(feed, gdata.spreadsheet.SpreadsheetsListFeed):
print '%s %s %s' % (i, entry.title.text, entry.content.text)
# Print this row's value for each column (the custom dictionary is
# built using the gsx: elements in the entry.)
print 'Contents:'
for key in entry.custom:
print ' %s: %s' % (key, entry.custom[key].text)
print '\n',
else:
print '%s %s\n' % (i, entry.title.text)
def _InvalidCommandError(self, input):
print 'Invalid input: %s\n' % (input)
def Run(self):
self._PromptForSpreadsheet()
self._PromptForWorksheet()
input = raw_input('cells or list? ')
if input == 'cells':
while True:
self._PromptForCellsAction()
elif input == 'list':
while True:
self._PromptForListAction()
def main():
# parse command line options
try:
opts, args = getopt.getopt(sys.argv[1:], "", ["user=", "pw="])
except getopt.error, msg:
print 'python spreadsheetExample.py --user [username] --pw [password] '
sys.exit(2)
user = 'fake#gmail.com'
pw = 'fakepassword'
key = ''
# Process options
for o, a in opts:
if o == "--user":
user = a
elif o == "--pw":
pw = a
if user == '' or pw == '':
print 'python spreadsheetExample.py --user [username] --pw [password] '
sys.exit(2)
sample = SimpleCRUD(user, pw)
sample.Run()
if __name__ == '__main__':
main()
However this returns the following error:
Traceback (most recent call last):
File "/Users/Chris/Desktop/gdata_test.py", line 200, in <module>
main()
File "/Users/Chris/Desktop/gdata_test.py", line 196, in main
sample.Run()
File "/Users/Chris/Desktop/gdata_test.py", line 162, in Run
self._PromptForSpreadsheet()
File "/Users/Chris/Desktop/gdata_test.py", line 49, in _PromptForSpreadsheet
feed = self.gd_client.GetSpreadsheetsFeed()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/gdata/spreadsheet/service.py", line 99, in GetSpreadsheetsFeed
converter=gdata.spreadsheet.SpreadsheetsSpreadsheetsFeedFromString)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/gdata/service.py", line 1074, in Get
return converter(result_body)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/gdata/spreadsheet/__init__.py", line 395, in SpreadsheetsSpreadsheetsFeedFromString
xml_string)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/atom/__init__.py", line 93, in optional_warn_function
return f(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/atom/__init__.py", line 127, in CreateClassFromXMLString
tree = ElementTree.fromstring(xml_string)
File "<string>", line 125, in XML
cElementTree.ParseError: no element found: line 1, column 0
[Finished in 0.3s with exit code 1]
[shell_cmd: python -u "/Users/Chris/Desktop/gdata_test.py"]
[dir: /Users/Chris/Desktop]
[path: /usr/bin:/bin:/usr/sbin:/sbin]
I should also mention that I've been using Gspread as a method to interact with Google Spreadsheets, but when I run the below code, I get the gid, but I need to have the worksheet id.
gc = gspread.authorize(credentials)
sh = gc.open_by_url('google_spreadsheet_url')
sh.get_id_fields()
>> {'spreadsheet_id': '1BgCEn-3Nor7UxOEPwD-qv8qXe7CaveJBrn9_Lcpo4W4','worksheet_id': 'oqitk0d'}

See the self.gd_client.ProgrammaticLogin() call - this is causing the major problem since it uses the "ClientLogin" authorization method which was first deprecated and later removed on April 20, 2015.
I would actually look into the more fresh and actively developed gspread module instead.
Here is a, somewhat insane, example demonstrating how to extract the actual "gid" value for a given spreadsheet and worksheet name. Note that you would first need to generate the JSON file with the OAuth credentials (I'm assuming you've already done that).
The code (added comments that would hopefully help to understand it):
import urlparse
import xml.etree.ElementTree as ET
import gspread
from oauth2client.service_account import ServiceAccountCredentials
SPREADSHEET_NAME = 'My Test Spreadsheet'
WORKSHEET_NAME = "Sheet2"
PATH_TO_JSON_KEYFILE = '/path/to/json/key/file.json'
NAMESPACES = {'ns0': 'http://www.w3.org/2005/Atom'}
SCOPES = ['https://spreadsheets.google.com/feeds']
# log in
credentials = ServiceAccountCredentials.from_json_keyfile_name(PATH_TO_JSON_KEYFILE, SCOPES)
gss_client = gspread.authorize(credentials)
# open spreadsheet
gss = gss_client.open(SPREADSHEET_NAME)
# extract the full feed url
root = gss._feed_entry
full_feed_url = next(elm.attrib["href"] for elm in root.findall("ns0:link", namespaces=NAMESPACES) if "full" in elm.attrib["href"])
# get the feed and extract the gid value for a given sheet name
response = gss_client.session.get(full_feed_url)
root = ET.fromstring(response.content)
sheet_entry = next(elm for elm in root.findall("ns0:entry", namespaces=NAMESPACES)
if elm.find("ns0:title", namespaces=NAMESPACES).text == WORKSHEET_NAME)
link = next(elm.attrib["href"] for elm in sheet_entry.findall("ns0:link", namespaces=NAMESPACES)
if "gid=" in elm.attrib["href"])
# extract "gid" from URL
gid = urlparse.parse_qs(urlparse.urlparse(link).query)["gid"][0]
print(gid)
It also looks like there is a way to convert the worksheet ID to a gid value, see:
How to convert Google spreadsheet's worksheet string id to integer index (GID)?

Jan 2017
You can use the new google spreadsheet api v4. You could take look at pygsheets library which uses api v4.
import pygsheets
#authorize the pygsheets
gc = pygsheets.authorize()
#open the spreadsheet
sh = gc.open('my new ssheet')
# get the worksheet and its id
print sh.worksheet_by_title("my test sheet").id

this seems to work for me using gspread
given a spreadsheet's worksheet url named 'mysheet1' that looks like this:
https://docs.google.com/spreadsheets/d/xxxxxf435454xxkjkjk23232325/edit#gid=645031900
this can be used to retrieve the gid value (aka worksheet id or sheet id):
# Spreadsheet key: the part of the URL between "/d/" and "/edit".
ss_key = 'xxxxxf435454xxkjkjk23232325'
# Title of the worksheet (tab) whose id is wanted.
wks_name = 'mysheet1'
# "gc" is an already authorized gspread client.
gc.open_by_key(ss_key).worksheet(wks_name).id
result:
645031900

Writing to an existing file without overwriting/erasing

My goal is quite simple, but I couldn't find it on the guide for configobj.
When I run my code I want it to write to a file but not erase what there's in the file already.
I would like everytime I run this it should write underneath what's already in the file
This is my current code, which erases/overwrites what is already inside dasd.ini:
from configobj import ConfigObj
config = ConfigObj()
config.filename = "dasd.ini"
#
config['hey'] = "value1"
config['test'] = "value2"
#
config['another']['them'] = "value4"
#
config.write()

this would be remarkably simpler if configobj accepted a file-like object instead of a file name. This is a solution i offered in comments.
import tempfile
with tempfile.NamedTemporaryFile() as t1, tempfile.NamedTemporaryFile() as t2, open('dasd.ini', 'w') as fyle:
config = ConfigObj()
config.filename = t1.file.name
config['hey'] = "value1"
config['test'] = "value2"
config['another']['them'] = "value4"
config.write()
do_your_thing_with_(t2)
t1.seek(0)
t2.seek(0)
fyle.write(t2.read())
fyle.write(t1.read())

If I understand your question correctly, doing what you want is a very simple change. Use the following syntax to create your initial config object. This reads in keys and values from the existing file.
config = ConfigObj("dasd.ini")
Then you can add new settings or change the existing ones as in your example code.
config['hey'] = "value1"
config['test'] = "value2"
After you write it out using config.write(), you'll find that your dasd.ini file contains the original and new keys/values merged. It also preserves any comments you had in your original ini file, with new keys/values added to the end of each section.
Check out this link, I found it to be quite helpful: An Introduction to ConfigObj

try it:
You have to read all keys and values of the section if the section existed already
and then write the whole section data
# -*- coding: cp950 -*-
import configobj
import os
#-------------------------------------------------------------------------
# _readINI(ini_file, szSection, szKey)
# read KeyValue from a ini file
# return True/False, KeyValue
#-------------------------------------------------------------------------
def _readINI(ini_file, szSection, szKey=None):
    """Read a whole section, or one key of a section, from an ini file.

    Returns a ``(ret, keyvalue)`` tuple:
    ``(True, value)`` on success, ``(False, None)`` when reading fails (for
    example a missing section or key), and ``(None, None)`` when *ini_file*
    does not exist. With *szKey* omitted, ``value`` is the whole section.
    """
    ret = None
    keyvalue = None
    if os.path.exists(ini_file):
        try:
            config = configobj.ConfigObj(ini_file, encoding='UTF8')
            if szKey is not None:
                keyvalue = config[szSection][szKey]
            else:
                keyvalue = config[szSection]
            ret = True
            # Single-argument print() behaves the same on Python 2 and 3.
            print(keyvalue)
        except Exception:
            ret = False
    return ret, keyvalue
#-------------------------------------------------------------------------
# _writeINI(ini_file, szSection, szKey, szKeyValue):
# write key value into a ini file
# return True/False
# You have to read all keys and values of the section if the section existed already
# and then write the whole section data
#-------------------------------------------------------------------------
def _writeINI(ini_file, szSection, szKey, szKeyValue):
    """Set ``szKey = szKeyValue`` in section *szSection* of an ini file.

    The file is created (with its header comment) when it does not exist.
    Existing keys of the section are read first and written back together
    with the new key. Returns True on success, False if anything failed;
    the error is printed.
    """
    ret = False
    try:
        ret_section = _readINI(ini_file, szSection)
        if not os.path.exists(ini_file):
            # create a new ini file with cfg header comment
            CreateNewIniFile(ini_file)
        config = configobj.ConfigObj(ini_file, encoding='UTF8')
        if ret_section[1] is None:
            config[szSection] = {}
        else:
            config[szSection] = ret_section[1]
        config[szSection][szKey] = szKeyValue
        config.write()
        ret = True
    except Exception as e:
        print(str(e))
    return ret
#-------------------------------------------------------------------------
# CreateNewIniFile(ini_file)
# create a new ini with header comment
# return True/False
#-------------------------------------------------------------------------
def CreateNewIniFile(ini_file):
    """Create *ini_file* containing only the header comment.

    An existing file is left untouched. Returns True when the file was
    created, False when it already existed or could not be written (the
    error is printed).
    """
    ret = False
    header = (
        '########################################################\n',
        '# Configuration File for Parallel Settings of Moldex3D #\n',
        '# Please Do Not Modify This File #\n',
        '########################################################\n',
        '\n\n',
    )
    try:
        if not os.path.exists(ini_file):
            # The context manager closes the file even if a write fails.
            with open(ini_file, 'w+') as f:
                f.writelines(header)
            ret = True
    except Exception as e:
        print(e)
    return ret
#----------------------------------------------------------------------
if __name__ == "__main__":
path = 'D:\\settings.cfg'
_writeINI(path, 'szSection', 'szKey', u'kdk12341 他dkdk')
_writeINI(path, 'szSection', 'szKey-1', u'kdk123412dk')
_writeINI(path, 'szSection', 'szKey-2', u'kfffk')
_writeINI(path, 'szSection', 'szKey-3', u'dhhhhhhhhhhhh')
_writeINI(path, 'szSection-333', 'ccc', u'555')
#_writeINI(path, 'szSection-222', '', u'')
print _readINI(path, 'szSection', 'szKey-2')
print _readINI(path, 'szSection-222')
#CreateNewIniFile(path)

split a pdf based on outline

i would like to use pyPdf to split a pdf file based on the outline where each destination in the outline refers to a different page within the pdf.
example outline:
main --> points to page 1
sect1 --> points to page 1
sect2 --> points to page 15
sect3 --> points to page 22
it is easy within pyPdf to iterate over each page of the document or each destination in the document's outline; however, i cannot figure out how to get the page number where the destination points.
does anybody know how to find the referencing page number for each destination in the outline?

I figured it out:
class Darrell(pyPdf.PdfFileReader):
    """PdfFileReader that can report the page each outline entry points to."""

    def getDestinationPageNumbers(self):
        """Return a dict mapping outline titles to zero-based page numbers.

        Entries whose page object is not found in the page tree map to the
        string ``'???'``. Entries sharing a title overwrite one another.
        """
        def _setup_outline_page_ids(outline, _result=None):
            # Walk the (possibly nested) outline and record the object id of
            # the page each destination refers to.
            if _result is None:
                _result = {}
            for obj in outline:
                if isinstance(obj, pyPdf.pdf.Destination):
                    # id(obj) keeps destinations with equal titles apart.
                    _result[(id(obj), obj.title)] = obj.page.idnum
                elif isinstance(obj, list):
                    _setup_outline_page_ids(obj, _result)
            return _result

        def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
            # Walk the page tree depth-first; _num_pages grows by one entry
            # per leaf page, so its length is the index of the next page.
            if _result is None:
                _result = {}
            if pages is None:
                _num_pages = []
                pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
            t = pages["/Type"]
            if t == "/Pages":
                for page in pages["/Kids"]:
                    _result[page.idnum] = len(_num_pages)
                    _setup_page_id_to_num(page.getObject(), _result, _num_pages)
            elif t == "/Page":
                _num_pages.append(1)
            return _result

        outline_page_ids = _setup_outline_page_ids(self.getOutlines())
        page_id_to_page_numbers = _setup_page_id_to_num()
        result = {}
        # items() instead of iteritems(): works on both Python 2 and Python 3.
        for (_, title), page_idnum in outline_page_ids.items():
            result[title] = page_id_to_page_numbers.get(page_idnum, '???')
        return result
pdf = Darrell(open(PATH-TO-PDF, 'rb'))
template = '%-5s %s'
print template % ('page', 'title')
for p,t in sorted([(v,k) for k,v in pdf.getDestinationPageNumbers().iteritems()]):
print template % (p+1,t)

This is just what I was looking for. Darrell's additions to PdfFileReader should be part of PyPDF2.
I wrote a little recipe that uses PyPDF2 and sejda-console to split a PDF by bookmarks. In my case there are several Level 1 sections that I want to keep together. This script allows me to do that and give the resulting files meaningful names.
import operator
import os
import subprocess
import sys
import time
import PyPDF2 as pyPdf
# need to have sejda-console installed
# change this to point to your installation
sejda = 'C:\\sejda-console-1.0.0.M2\\bin\\sejda-console.bat'
class Darrell(pyPdf.PdfFileReader):
...
if __name__ == '__main__':
t0= time.time()
# get the name of the file to split as a command line arg
pdfname = sys.argv[1]
# open up the pdf
pdf = Darrell(open(pdfname, 'rb'))
# build list of (pagenumbers, newFileNames)
splitlist = [(1,'FrontMatter')] # Customize name of first section
template = '%-5s %s'
print template % ('Page', 'Title')
print '-'*72
for t,p in sorted(pdf.getDestinationPageNumbers().iteritems(),
key=operator.itemgetter(1)):
# Customize this to get it to split where you want
if t.startswith('Chapter') or \
t.startswith('Preface') or \
t.startswith('References'):
print template % (p+1, t)
# this customizes how files are renamed
new = t.replace('Chapter ', 'Chapter')\
.replace(': ', '-')\
.replace(': ', '-')\
.replace(' ', '_')
splitlist.append((p+1, new))
# call sejda tools and split document
call = sejda
call += ' splitbypages'
call += ' -f "%s"'%pdfname
call += ' -o ./'
call += ' -n '
call += ' '.join([str(p) for p,t in splitlist[1:]])
print '\n', call
subprocess.call(call)
print '\nsejda-console has completed.\n\n'
# rename the split files
for p,t in splitlist:
old ='./%i_'%p + pdfname
new = './' + t + '.pdf'
print 'renaming "%s"\n to "%s"...'%(old, new),
try:
os.remove(new)
except OSError:
pass
try:
os.rename(old, new)
print' succeeded.\n'
except:
print' failed.\n'
print '\ndone. Spliting took %.2f seconds'%(time.time() - t0)

Small update to @darrell's class to be able to parse UTF-8 outlines, which I post as an answer because a comment would be hard to read.
Problem is in pyPdf.pdf.Destination.title which may be returned in two flavors:
pyPdf.generic.TextStringObject
pyPdf.generic.ByteStringObject
so the output of the _setup_outline_page_ids() function also contains two different types of title object, which fails with UnicodeDecodeError if an outline title contains anything other than ASCII.
I added this code to solve the problem:
if isinstance(title, pyPdf.generic.TextStringObject):
title = title.encode('utf-8')
Here is the whole class:
class PdfOutline(pyPdf.PdfFileReader):
def getDestinationPageNumbers(self):
def _setup_outline_page_ids(outline, _result=None):
if _result is None:
_result = {}
for obj in outline:
if isinstance(obj, pyPdf.pdf.Destination):
_result[(id(obj), obj.title)] = obj.page.idnum
elif isinstance(obj, list):
_setup_outline_page_ids(obj, _result)
return _result
def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
if _result is None:
_result = {}
if pages is None:
_num_pages = []
pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
t = pages["/Type"]
if t == "/Pages":
for page in pages["/Kids"]:
_result[page.idnum] = len(_num_pages)
_setup_page_id_to_num(page.getObject(), _result, _num_pages)
elif t == "/Page":
_num_pages.append(1)
return _result
outline_page_ids = _setup_outline_page_ids(self.getOutlines())
page_id_to_page_numbers = _setup_page_id_to_num()
result = {}
for (_, title), page_idnum in outline_page_ids.iteritems():
if isinstance(title, pyPdf.generic.TextStringObject):
title = title.encode('utf-8')
result[title] = page_id_to_page_numbers.get(page_idnum, '???')
return result

Darrell's class can be modified slightly to produce a multi-level table of contents for a pdf (in the manner of pdftoc in the pdftk toolkit.)
My modification adds one more parameter to _setup_page_id_to_num, an integer "level" which defaults to 1. Each invocation increments the level. Instead of storing just the page number in the result, we store the pair of page number and level. Appropriate modifications should be applied when using the returned result.
I am using this to implement the "PDF Hacks" browser-based page-at-a-time document viewer with a sidebar table of contents which reflects LaTeX section, subsection etc bookmarks. I am working on a shared system where pdftk can not be installed but where python is available.

A solution 10 years later for newer python and PyPDF:
from PyPDF2 import PdfReader, PdfWriter
filename = "main.pdf"
with open(filename, "rb") as f:
r = PdfReader(f)
bookmarks = list(map(lambda x: (x.title, r.get_destination_page_number(x)), r.outline))
print(bookmarks)
for i, b in enumerate(bookmarks):
begin = b[1]
end = bookmarks[i+1][1] if i < len(bookmarks) - 1 else len(r.pages)
# print(len(r.pages[begin:end]))
name = b[0] + ".pdf"
print(f"{name=}: {begin=}, {end=}")
with open(name, "wb") as f:
w = PdfWriter(f)
for p in r.pages[begin:end]:
w.add_page(p)
w.write(f)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python null check import xls - python

Related

Getting Import/Library issue in my robotframework

Method cannot access class variable of different class

How to get the Worksheet ID from a Google Spreadsheet with python?

Writing to an existing file without overwriting/erasing

split a pdf based on outline

Categories

Resources