Print specific lines from a text file in Python
I have a text file from which I'm trying to print the lines that don't start with a number, but the output is not readable.
This is part of what my code returns:
{\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf600
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;\red4\green4\blue4;\red247\green247\blue247;\red0\green0\blue0;
\red255\green255\blue255;\red77\green77\blue77;}
{\*\expandedcolortbl;;\cssrgb\c1176\c1176\c1176;\cssrgb\c97647\c97647\c97647;\cssrgb\c0\c0\c0;
\cssrgb\c100000\c100000\c100000;\cssrgb\c37647\c37647\c37647;}
\paperw11900\paperh16840\margl1440\margr1440\vieww10800\viewh8400\viewkind0
\deftab720
\pard\pardeftab720\partightenfactor0
\f0\fs26 \cf2 \cb3 Since the start of digital video in 1988, new video formats are developed every year\cf4 \cb5 \
\pard\pardeftab720\partightenfactor0
\cf6 \cb3 00:14\cb5 \
And this is my code:
numbers = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
aFile = open("/Users/maira/Desktop/text.rtf")
lines = aFile.readlines()
for line in lines:
if not line.startswith((numbers)):
print(line)
aFile.close()
This is an example of the original text:
Since the start of digital video in 1988, new video formats are developed every year
00:14
in an attempt to provide improvements in quality, file size and video playback.
00:18
The popularity of video continues to grow rapidly, with 78% of people watching at least
00:24
one digital video on one of their devices every single day; However video formats and
00:29
how they work is still a subject of much confusion for most people.
I've seen some questions similar to mine, but I can't get to a solution.
I'd appreciate any advice, and if there's also a way of deleting the blank lines in between lines, I'd be very thankful.
Thank you.
I used a very complete function provided in this answer to strip all the RTF markup, and after that I use a regex to remove the HH:MM timestamps. Maybe this will help you.
import re

def striprtf(text):
    pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
    # control words which specify a "destination".
    destinations = frozenset((
        'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
        'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
        'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
        'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
        'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
        'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
        'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype',
        'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr',
        'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
        'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
        'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
        'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
        'listoverridetable','listpicture','liststylename','listtable','listtext',
        'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
        'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
        'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
        'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
        'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
        'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
        'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
        'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
        'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
        'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
        'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
        'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
        'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
        'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
        'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
        'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
        'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
        'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
        'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
        'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
        'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
        'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
        'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
        'svb','tc','template','themedata','title','txe','ud','upr','userprops',
        'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
        'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
        'xmlopen',
    ))
    # Translation of some special characters.
    specialchars = {
        'par': '\n',
        'sect': '\n\n',
        'page': '\n\n',
        'line': '\n',
        'tab': '\t',
        'emdash': u'\u2014',
        'endash': u'\u2013',
        'emspace': u'\u2003',
        'enspace': u'\u2002',
        'qmspace': u'\u2005',
        'bullet': u'\u2022',
        'lquote': u'\u2018',
        'rquote': u'\u2019',
        'ldblquote': u'\u201C',
        'rdblquote': u'\u201D',
    }
    stack = []
    ignorable = False  # Whether this group (and all inside it) are "ignorable".
    ucskip = 1         # Number of ASCII characters to skip after a unicode character.
    curskip = 0        # Number of ASCII characters left to skip
    out = []           # Output buffer.
    for match in pattern.finditer(text):
        word, arg, hex, char, brace, tchar = match.groups()
        if brace:
            curskip = 0
            if brace == '{':
                # Push state
                stack.append((ucskip, ignorable))
            elif brace == '}':
                # Pop state
                ucskip, ignorable = stack.pop()
        elif char:  # \x (not a letter)
            curskip = 0
            if char == '~':
                if not ignorable:
                    out.append(u'\xA0')
            elif char in '{}\\':
                if not ignorable:
                    out.append(char)
            elif char == '*':
                ignorable = True
        elif word:  # \foo
            curskip = 0
            if word in destinations:
                ignorable = True
            elif ignorable:
                pass
            elif word in specialchars:
                out.append(specialchars[word])
            elif word == 'uc':
                ucskip = int(arg)
            elif word == 'u':
                c = int(arg)
                if c < 0: c += 0x10000
                # chr() covers the full Unicode range in Python 3 (the original Python 2 code used unichr here)
                out.append(chr(c))
                curskip = ucskip
        elif hex:  # \'xx
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                c = int(hex, 16)
                out.append(chr(c))
        elif tchar:
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                out.append(tchar)
    return ''.join(out)
with open('/Users/maira/Desktop/text.rtf', 'r') as file:
    rtf = file.read()

text = striprtf(rtf)
text = re.sub('(0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]', '', text)
print(text)
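If you also want to drop the blank lines that are left behind (the second part of the question), here is a minimal follow-up sketch, assuming text already holds the stripped output from above; the digit check is illustrative and mirrors the original startswith idea:

# Keep only non-empty lines that don't start with a digit.
clean_lines = [line for line in text.splitlines()
               if line.strip() and not line.lstrip()[0].isdigit()]
print("\n".join(clean_lines))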
Related
Python Pandas Highlight single signs possible?
I am really new to the whole Python development and have a question. I would like to achieve the following result: However, in my research I only found the possibility to change the style for a whole cell. I am doing a complete character-by-character comparison and would like to colour the individual characters accordingly. Maybe this can also be done with Python (my VBA script is very slow).
This is my Python script so far:

import pandas as pd
import numpy as np

path = "XXXXX"
data = pd.read_csv(path, names=["Dir1", "Dir2", "File1", "File2", "Diff", "Line1", "A", "Line2", "B"], sep="|")

line = 1
for ind in data.index:
    if data.A[ind] == data.B[ind]:
        var_ok = True
    else:
        # Work just with different Values
        var_ok = False
        var_length_A = len(str(data.A[ind]))
        var_length_B = len(str(data.B[ind]))
        # check length
        # A is longer
        if var_length_A > var_length_B:
            var_longer = var_length_A
        # Atos is longer
        elif var_length_A < var_length_B:
            var_longer = var_length_B
        # Same length
        else:
            var_longer = var_length_A
        for count in range(1, var_longer):
            # read every sign
            var_sign_A = mid(data.A[ind], count, 1)
            var_sign_B = mid(data.B[ind], count, 1)
            if var_sign_A != var_sign_B:
                pass  # highlight this
            else:
                pass  # Do nothing
    print([ind], "|\t", data.A[ind], "|\t", data.B[ind], "|\t", var_ok, "|\t", var_length_A, "|\t", var_length_B, "|\t", var_longer)

This is a part from my VBA script:

'If both are filled
Else
    counter = 1
    'Compare character by character
    For counter = counter To leng
        If Mid(Cells(zeile, Spalte1), counter, 1) <> Mid(Cells(zeile, Spalte2), counter, 1) Then
            With Cells(zeile, Spalte2).Characters(start:=counter, Length:=1).Font
                .Color = var2
                .FontStyle = "Fett"
            End With
            With Cells(zeile, Spalte1).Characters(start:=counter, Length:=1).Font
                .Color = var2
                .FontStyle = "Fett"
            End With
        Else
            With Cells(zeile, Spalte2).Characters(start:=counter, Length:=1).Font
                .Color = var1
                .FontStyle = "Standard"
            End With
            With Cells(zeile, Spalte1).Characters(start:=counter, Length:=1).Font
                .Color = var1
                .FontStyle = "Standard"
            End With
        End If
    Next
End If
End If

BR & thank you :) Marcel
Depth first search algorithm skipping spaces in maze?
After concluding the first lecture of Harvard's AI course on edX, I have decided to implement the concepts taught, first being the depth-first search algorithm. The objective of this program is to input a maze in text file mazefile and find a path from S to G using the depth-first search algorithm. The project currently consists of 4 files: (1) the code with the class methods to operate or use (2) the text file which contains the maze, (3) another text file that contains the result (where the AI has explored) and (4) the main Python script. Here they are, feel free to copy and paste these into a folder and to see how they run.

processText.py (file 1)

#code to process the mazefile file.
class importMaze:
    def __init__(self, maze):
        self.fileLines = []
        self.fileName = maze
        self.switch = False
        self.toBeReturned = []

    def processThis(self):
        f = open(self.fileName, "r")
        for x in f:
            self.fileLines.append(x[:-1])
        f.close()
        for i in self.fileLines:
            if self.switch == True:
                if str(i) == "END":
                    self.switch = False
                else:
                    self.toBeReturned.append(i)
            else:
                if str(i) == "START":
                    self.switch = True
        return self.toBeReturned

class mazePointer:
    def __init__(self, mazearray):
        self.Sample = mazearray
        self.initialPosition = []
        for y in range(0, len(self.Sample)):
            for x in range(0, len(self.Sample[y])):
                if str(self.Sample[y][x]) == "S":
                    self.initialPosition = [x, y]
        self.currentPosition = self.initialPosition

    def whatIs(self, xcoordinate, ycoordinate):
        return (self.Sample[ycoordinate])[xcoordinate]

    def nearbyFreeSpaces(self, search):
        self.freeSpaces = []
        if self.whatIs(self.currentPosition[0]-1, self.currentPosition[1]) == search:
            self.freeSpaces.append([self.currentPosition[0]-1, self.currentPosition[1]])
        if self.whatIs(self.currentPosition[0]+1, self.currentPosition[1]) == search:
            self.freeSpaces.append([self.currentPosition[0]+1, self.currentPosition[1]])
        if self.whatIs(self.currentPosition[0], self.currentPosition[1]-1) == search:
            self.freeSpaces.append([self.currentPosition[0], self.currentPosition[1]-1])
        if self.whatIs(self.currentPosition[1], self.currentPosition[1]+1) == search:
            self.freeSpaces.append([self.currentPosition[1], self.currentPosition[1]+1])
        return self.freeSpaces

    def moveTo(self, position):
        self.currentPosition = position

TestingTrack.py (the main file)

from processText import importMaze, mazePointer

testObject = importMaze("mazefile")
environment = testObject.processThis()
finger = mazePointer(environment)

frontier = []
explored = []
result = ""

def Search():
    global result
    if len(finger.nearbyFreeSpaces("G")) == 1:  #If the goal is bordering this space
        result = finger.nearbyFreeSpaces("G")[0]
        explored.append(finger.currentPosition)
    else:
        newPlaces = finger.nearbyFreeSpaces("F")  #finds the free spaces bordering
        for i in newPlaces:
            if i in explored:  #Skips the ones already visited
                pass
            else:
                frontier.append(i)

while result == "":
    explored.append(finger.currentPosition)
    Search()
    finger.moveTo(frontier[-1])
    frontier.pop(-1)

exploredArray = []
for y in range(len(environment)):  #Recreates the maze, fills in 'E' in where the AI has visited.
    holder = ""
    for x in range(len(environment[y])):
        if [x, y] in explored:
            holder += "E"
        else:
            holder += str(environment[y][x])
    exploredArray.append(holder)

def createResult(mazeList, title, append):  #Creating the file
    file = open("resultfile", append)
    string = title + " \n F - Free \n O - Occupied \n S - Starting point \n G - Goal \n E - Explored/Visited \n (Abdulaziz Albastaki 2020) \n \n (top left coordinate - 0,0) \n "
    for i in exploredArray:
        string += "\n" + str(i)
    string += "\n \n Original problem \n"
    for i in environment:
        string += "\n" + str(i)
    file.write(string)
    file.close()

def tracingPath():
    initialExplored = explored
    proceed = True
    newExplored = []
    for i in explored:
        finger.moveTo()  #incomplete

print(explored)
createResult(exploredArray, "DEPTH FIRST SEARCH", "w")

mazefile (the program will read this file to get the maze)

F - Free
O - Occupied
S - Starting point
G - Goal
(Abdulaziz Albastaki 2020)

START
OOOOOOOOOOOOOOOO
OFFFFFFFFFFFFFGO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OSFFFFFFFFFFFFFO
OOOOOOOOOOOOOOOO
END

Made by Abdulaziz Albastaki in October 2020
You can change the maze and its size however it must
-Respect the key above
-Have ONE Starting point and goal
-The maze must be in between 'START' and 'END'
-The maze MUST be surrounded by occupied space

SAMPLE PROBLEMS:

OOOOOOOOOOOOOOOO
OFFFFFFFFFFFFFGO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OSFFFFFFFFFFFFFO
OOOOOOOOOOOOOOOO

OOOOOOOOOOOOOOOOO
OFOFFFFFOOOFFFOOO
OFFFOOOFOOFFOOOFO
OFOOOOOFOOFOOOOFO
OSFGFFFFFFFFFFFFO
OOOOOOOOOOOOOOOOO

There is also a resultfile, however if you would just create an empty textfile with that name (no extension), the program will fill it in with results. The problem is with the resultfile, here it is:

DEPTH FIRST SEARCH
F - Free
O - Occupied
S - Starting point
G - Goal
E - Explored/Visited
(Abdulaziz Albastaki 2020)

(top left coordinate - 0,0)

OOOOOOOOOOOOOOOO
OFFFFFFFFFFFFFGO
OFOOOOOOOOOOOOEO
OFOOOOOOOOOOOOEO
OFOOOOOOOOOOOOEO
OEOOOOOOOOOOOOEO
OEFFFEEEEEEEEEEO
OOOOOOOOOOOOOOOO

Original problem

OOOOOOOOOOOOOOOO
OFFFFFFFFFFFFFGO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OSFFFFFFFFFFFFFO
OOOOOOOOOOOOOOOO

The AI skipped a few spaces to get to the goal, why is it doing so? Feel free to ask me for any clarifications.
There are the following issues:

The last if block in nearbyFreeSpaces uses a wrong index:

if self.whatIs(self.currentPosition[1],self.currentPosition[1]+1) == search:
    self.freeSpaces.append([self.currentPosition[1],self.currentPosition[1]+1])

should be:

if self.whatIs(self.currentPosition[0],self.currentPosition[1]+1) == search:
    self.freeSpaces.append([self.currentPosition[0],self.currentPosition[1]+1])

The final position is not correctly added to the path. The last line of this block:

if len(finger.nearbyFreeSpaces("G")) == 1: #If the goal is bordering this space
    result = finger.nearbyFreeSpaces("G")[0]
    explored.append(finger.currentPosition)

...should be:

    explored.append(result)
Escape reserved characters in a list by adding backslash in front of it
reserved_chars = "? & | ! { } [ ] ( ) ^ ~ * : \ " ' + -"
list_vals = ['gold-bear#gmail.com', 'P&G#dom.com', 'JACKSON! BOT', 'annoying\name']

What is the fastest way to loop through every element in a list and add a \ in front of a reserved character if one of the elements contains it?

Desired output:

fixed_list = ['gold\-bear#gmail.com', 'P\&G#dom.com', 'JACKSON\! BOT', 'annoying\\name']
You could make a translation table with str.maketrans() and pass that into translate. This takes a little setup, but you can reuse the translation table and it's quite fast:

reserved_chars = '''?&|!{}[]()^~*:\\"'+-'''
list_vals = ['gold-bear#gmail.com', 'P&G#dom.com', 'JACKSON! BOT', 'annoying\\name']

# make trans table
replace = ['\\' + l for l in reserved_chars]
trans = str.maketrans(dict(zip(reserved_chars, replace)))

# translate with trans table
fixed_list = [s.translate(trans) for s in list_vals]

print("\n".join(fixed_list))

Prints:

gold\-bear#gmail.com
P\&G#dom.com
JACKSON\! BOT
annoying\\name
There is no fast way - you've got strings, strings are immutable, you need to create new ones. Probably the best way is to build your own translation dictionary and do the grunt work yourself:

reserved = """? & | ! { } [ ] ( ) ^ ~ * : \ " ' + -"""
tr = {c: f"\\{c}" for c in reserved}
print(tr)

data = ['gold-bear#gmail.com', 'P&G#dom.com', 'JACKSON! BOT', 'annoying\name']

transformed = [''.join(tr.get(letter, letter) for letter in word) for word in data]
for word in transformed:
    print(word)

Output:

# translation dictionary
{'?': '\\?', ' ': '\\ ', '&': '\\&', '|': '\\|', '!': '\\!', '{': '\\{', '}': '\\}', '[': '\\[', ']': '\\]', '(': '\\(', ')': '\\)', '^': '\\^', '~': '\\~', '*': '\\*', ':': '\\:', '\\': '\\\\', '"': '\\"', "'": "\\'", '+': '\\+', '-': '\\-'}

# transformed strings
gold\-bear#gmail.com
P\&G#dom.com
JACKSON\!\ BOT
annoying
ame

Sidenotes: your example missed escaping the space inside 'JACKSON\! BOT'. The repr() of the transformed list looks "wrongly" escaped because, when printed, each '\' is escaped again - what actually gets printed is shown in the word list above.
Definitely not the fastest, but could be the easiest to code. Make a regex that does it for you, and run re.sub, like this:

import re

reserved_chars = "?&|!{}[]()^~*:\\\"'+-"
replace_regex = "([" + ''.join('\\x%x' % ord(x) for x in reserved_chars) + "])"
list_vals = ['gold-bear#gmail.com', 'P&G#dom.com', 'JACKSON! BOT', r'annoying\name']
escaped_vals = [re.sub(replace_regex, r"\\\1", x) for x in list_vals]

Again, just to clarify, regexes are SLOW.
Own implementation of base64 encoding and decoding of files in Python
I have a problem with my own implementation of base64 encoding. I have managed to get the code below. It only works for text files with English letters, I suppose. For instance, a PDF file is encoded and decoded, but single characters differ.

def base64Encode(data):
    alphabet = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P","Q","R","S","T","U","V","W","X","Y","Z","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","0","1","2","3","4","5","6","7","8","9","+","/"]
    bit_str = ""
    base64_str = ""
    for char in data:
        bin_char = bin(char).lstrip("0b")
        bin_char = bin_char.zfill(8)
        bit_str += bin_char
    brackets = [bit_str[x:x+6] for x in range(0,len(bit_str),6)]
    for bracket in brackets:
        if(len(bracket) < 6):
            bracket = bracket + (6-len(bracket))*"0"
        base64_str += alphabet[int(bracket,2)]
        # print(brackets[-4:])
        #if(bracket[-1:)
    #print(len(base64_str))
    #if(len(base64_str) != 76):
    #    base64_str += "="
    return base64_str

def base64Decode(text):
    alphabet = ["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","0","1","2","3","4","5","6","7","8","9","+","/"]
    bit_str = ""
    text_str = ""
    for char in text:
        if char in alphabet:
            bin_char = bin(alphabet.index(char)).lstrip("0b")
            bin_char = bin_char.zfill(6)
            bit_str += bin_char
    brackets = [bit_str[x:x+8] for x in range(0,len(bit_str),8)]
    for bracket in brackets:
        text_str += chr(int(bracket,2))
    return text_str.encode("UTF-8")

w = open("encode.txt", "w")
with open("bla.txt", "rb") as f:
    byte = f.read(57)
    while byte:
        w.write(base64Encode(byte))
        w.write("\n")
        byte = f.read(57)
w.close()
f.close()

w = open("decode.txt", "wb")
with open("encode.txt", "r") as f:
    byte = f.read(77)
    while byte:
        w.write(base64Decode(byte))
        byte = f.read(77)
w.close()
f.close()

In my opinion, the line return text_str.encode("UTF-8") should be without the encoding to UTF-8. However, if you leave only return text_str, you get the error: TypeError: 'str' does not support the buffer interface.

bla.txt:

Phil Mercer reports on Cyclone Pam which has ravaged the Pacific nation of Vanuatu. Video courtesy of YouTube/Isso Nihmei at 350.org

Save the Children's Vanuatu country director Tom Skirrow said on Saturday: "The scene here this morning is complete devastation - houses are destroyed, trees are down, roads are blocked and people are wandering the streets looking for help.

ĄŚĆŹŻÓ

encode.txt:

UGhpbCBNZXJjZXIgcmVwb3J0cyBvbiBDeWNsb25lIFBhbSB3aGljaCBoYXMgcmF2YWdlZCB0aGUg
UGFjaWZpYyBuYXRpb24gb2YgVmFudWF0dS4gVmlkZW8gY291cnRlc3kgb2YgWW91VHViZS9Jc3Nv
IE5paG1laSBhdCAzNTAub3JnDQoNClNhdmUgdGhlIENoaWxkcmVuJ3MgVmFudWF0dSBjb3VudHJ5
IGRpcmVjdG9yIFRvbSBTa2lycm93IHNhaWQgb24gU2F0dXJkYXk6ICJUaGUgc2NlbmUgaGVyZSB0
aGlzIG1vcm5pbmcgaXMgY29tcGxldGUgZGV2YXN0YXRpb24gLSBob3VzZXMgYXJlIGRlc3Ryb3ll
ZCwgdHJlZXMgYXJlIGRvd24sIHJvYWRzIGFyZSBibG9ja2VkIGFuZCBwZW9wbGUgYXJlIHdhbmRl
cmluZyB0aGUgc3RyZWV0cyBsb29raW5nIGZvciBoZWxwLg0KDQrEhMWaxIbFucW7w5M

decode.txt:

Phil Mercer reports on Cyclone Pam which has ravaged the Pacific nation of Vanuatu. Video courtesy of YouTube/Isso Nihmei at 350.org

Save the Children's Vanuatu country director Tom Skirrow said on Saturday: "The scene here this morning is complete devastation - houses are destroyed, trees are down, roads are blocked and people are wandering the streets looking for help.

ÄÅÄŹŻÃ

The same text encoded by the page http://www.motobit.com/util/base64-decoder-encoder.asp:

UGhpbCBNZXJjZXIgcmVwb3J0cyBvbiBDeWNsb25lIFBhbSB3aGljaCBoYXMgcmF2YWdlZCB0aGUg
UGFjaWZpYyBuYXRpb24gb2YgVmFudWF0dS4gVmlkZW8gY291cnRlc3kgb2YgWW91VHViZS9Jc3Nv
IE5paG1laSBhdCAzNTAub3JnDQoNClNhdmUgdGhlIENoaWxkcmVuJ3MgVmFudWF0dSBjb3VudHJ5
IGRpcmVjdG9yIFRvbSBTa2lycm93IHNhaWQgb24gU2F0dXJkYXk6ICJUaGUgc2NlbmUgaGVyZSB0
aGlzIG1vcm5pbmcgaXMgY29tcGxldGUgZGV2YXN0YXRpb24gLSBob3VzZXMgYXJlIGRlc3Ryb3ll
ZCwgdHJlZXMgYXJlIGRvd24sIHJvYWRzIGFyZSBibG9ja2VkIGFuZCBwZW9wbGUgYXJlIHdhbmRl
cmluZyB0aGUgc3RyZWV0cyBsb29raW5nIGZvciBoZWxwLg0KDQrEhMWaxIbFucW7w5M=

It is the same, except for the "=", which I omitted implementing due to the error at the very beginning of the file.

And a sample original file in PDF:

%PDF-1.5 %µµµµ 1 0 obj <</Type/Catalog/Pages 2 0 R/Lang(pl-PL) /StructTreeRoot 8 0 R/MarkInfo<</Marked true>>>> endobj 2 0 obj <</Type/Pages/Count 1/Kids[ 3 0 R] >> endobj 3 0 obj <</Type/Page/Parent 2 0 R/Resources<</Font<</F1 5 0 R>>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI] >>/MediaBox[ 0 0 595.32 841.92] /Contents 4 0 R/Group<</Type/Group/S/Transparency/CS/DeviceRGB>>/Tabs/S/StructParents 0>> endobj 4 0 obj <</Filter/FlateDecode/Length 110>> stream xœUÌ €#ྰï0QËÝ®Èiž?(†kb°hòý«ZD˜4ßÀΨ*;…¡xº ¨#“íªFrÄI!w…˜2ËQ81®D<™ÇS=Ó’léŠ82µ·>^åŒÊO- >[´SÀ endstream endobj 5 0 obj <</Type/Font/Subtype/TrueType/Name/F1/BaseFont/ABCDEE+Calibri/Encoding/WinAnsiEncoding/FontDescriptor 6 0 R/FirstChar 32/LastChar 97/Widths 15 0 R>> endobj 6 0 obj <</Type/FontDescriptor/FontName/ABCDEE+Calibri/Flags 32/ItalicAngle 0/Ascent 750/Descent -250/CapHeight 750/AvgWidth 521/MaxWidth 1743/FontWeight 400/XHeight 250/StemV 52/FontBBox[ -503 -250 1240 750] /FontFile2 16 0 R>> endobj 7 0 obj

And after executing the script:

%PDF-1.5 %µµµµ 1 0 obj <</Type/Catalog/Pages 2 0 R/Lang(pl-PL) /StructTreeRoot 8 0 R/MarkInfo<</Marked true>>>> endobj 2 0 obj <</Type/Pages/Count 1/Kids[ 3 0 R] >> endobj 3 0 obj <</Type/Page/Parent 2 0 R/Resources<</Font<</F1 5 0 R>>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI] >>/MediaBox[ 0 0 595.32 841.92] /Contents 4 0 R/Group<</Type/Group/S/Transparency/CS/DeviceRGB>>/Tabs/S/StructParents 0>> endobj 4 0 obj <</Filter/FlateDecode/Length 110>> stream xUÌ #ྰï0QËÝ®Èi?(kb°hòý«ZD4ßÀΨ*;¡xº ¨#íªFrÄI!w2ËQ81®D<ÇS=Ólé82µ·>^åÊO- >[´SÀ endstream endobj 5 0 obj <</Type/Font/Subtype/TrueType/Name/F1/BaseFont/ABCDEE+Calibri/Encoding/WinAnsiEncoding/FontDescriptor 6 0 R/FirstChar 32/LastChar 97/Widths 15 0 R>> endobj 6 0 obj <</Type/FontDescriptor/FontName/ABCDEE+Calibri/Flags 32/ItalicAngle 0/Ascent 750/Descent -250/CapHeight 750/AvgWidth 521/MaxWidth 1743/FontWeight 400/XHeight 250/StemV 52/FontBBox[ -503 -250 1240 750] /FontFile2 16 0 R>> endobj 7 0 obj

The differences are, for instance, at the beginning of lines 15 and 16. My goal is to load the file, encode it in base64, then decode it and obtain the same file, fit for use. I suppose that the error is in the data read or write, or in the encoding. Any suggestions?
My first suggestion is troubleshooting: determine whether you are failing to encode properly, decode properly, or both.
Encode the file using a working utility and with your app, and compare.
Decode a properly encoded file with your app and with a working utility, and compare.
Second suggestion: deal with the data as individual bytes, not text that may be interpreted as UTF-8. Open the PDF file in binary mode (see "Reading binary file in Python and looping over each byte" on how to do that) and pass the raw bytes to your base64Encode. Do not use the bin function to convert from string to binary.
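To illustrate the second suggestion, here is a minimal sketch (function and file names are my own, not from the original post) that reads raw bytes in binary mode and builds each 24-bit group with integer arithmetic instead of bin() and string padding; you can compare its output against the standard library's base64.b64encode:

# Hypothetical sketch: encode raw bytes three at a time using bit shifts instead of bin()/string slicing.
ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"

def encode_bytes(data):
    out = []
    for i in range(0, len(data), 3):
        chunk = data[i:i+3]
        # Pack up to three bytes into one 24-bit integer (missing bytes count as zero).
        n = int.from_bytes(chunk, "big") << (8 * (3 - len(chunk)))
        for shift in (18, 12, 6, 0):
            out.append(ALPHABET[(n >> shift) & 0x3F])
        # Replace the surplus symbols with '=' padding.
        out[len(out) - (3 - len(chunk)):] = "=" * (3 - len(chunk))
    return "".join(out)

with open("bla.pdf", "rb") as f:  # binary mode: we get bytes, not text
    raw = f.read()
print(encode_bytes(raw))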
I was able to accomplish this task. Replace .encode("UTF-8") with .encode("latin-1") and it works, at least for PDF files.
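A short note on why this helps (my explanation, not the original poster's): Latin-1 maps code points 0-255 one-to-one onto byte values 0-255, so a str built with chr() from arbitrary bytes can be turned back into exactly the same bytes, which UTF-8 does not guarantee:

# Every byte value survives a Latin-1 round trip; UTF-8 would expand values >= 0x80 into two bytes.
raw = bytes(range(256))
assert "".join(chr(b) for b in raw).encode("latin-1") == raw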
I've modified the original code. This works on text, PNG and PDF; I haven't tried other file types, but I expect it will work on them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 16 07:38:19 2019

#author: tracyanne
"""
import os

class Base64():

    def __init__(self):
        ## We only need to do this once
        self.b64 = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P","Q","R","S","T","U","V","W","X","Y","Z","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","0","1","2","3","4","5","6","7","8","9","+","/"]

    def Encode(self, data):
        alphabet = self.b64
        bit_str = ""
        base64_str = ""
        for char in data:
            bin_char = bin(char).lstrip("0b")
            bin_char = bin_char.zfill(8)
            bit_str += bin_char
        brackets = [bit_str[x:x+6] for x in range(0,len(bit_str),6)]
        for bracket in brackets:
            if(len(bracket) < 6):
                bracket = bracket + (6-len(bracket))*"0"
            base64_str += alphabet[int(bracket,2)]
        ## Add padding characters to maintain compatibility with forced padding
        padding_indicator = len(base64_str) % 4
        if padding_indicator == 3:
            base64_str += "="
        elif padding_indicator == 2:
            base64_str += "=="
        return base64_str

    def Decode(self, text, eof):
        alphabet = self.b64
        bit_str = ""
        text_str = ""
        for char in text:
            if char in alphabet:
                bin_char = bin(alphabet.index(char)).lstrip("0b")
                bin_char = bin_char.zfill(6)
                bit_str += bin_char
        brackets = [bit_str[x:x+8] for x in range(0,len(bit_str),8)]
        for bracket in brackets:
            ## When eof, ignore the last value in brackets to remove \x00
            if eof and brackets[len(brackets) - 1] == bracket:
                pass
            else:
                text_str += chr(int(bracket,2))
        ## encode string as Latin-1 == ISO-8859-1
        return text_str.encode("ISO-8859-1")

    def base64Encode(self, inFile, outFile):
        w = open(outFile, "w")
        with open(inFile, "rb") as f:
            byte = f.read(57)
            while byte:
                w.write(self.Encode(byte))
                w.write("\n")
                byte = f.read(57)
        w.close()
        f.close()

    def base64Decode(self, inFile, outFile):
        ## Get size of input file for later comparison
        fsize = os.path.getsize(inFile)
        incsize = 0
        eof = False
        w = open(outFile, "wb")
        with open(inFile, "r") as f:
            byte = f.read(77)
            while byte:
                ## Keep a running count of data read; once it equals the
                ## input file size, set eof True
                incsize += len(byte)
                if fsize - incsize == 0:
                    eof = True
                ## Pass eof to Decode (the original snippet called base64.base64Decode here,
                ## which looks like a typo for the instance's own Decode method)
                w.write(self.Decode(byte, eof))
                byte = f.read(77)
        w.close()
        f.close()
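A possible round trip using the class above (the file names here are just examples, not part of the answer):

# Hypothetical usage: encode a file to base64 text, then decode it back to binary.
codec = Base64()
codec.base64Encode("input.pdf", "encoded.txt")
codec.base64Decode("encoded.txt", "roundtrip.pdf")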
Python unified diff with line numbers from both "files"
I'm trying to figure out a way to create unified diffs with line numbers only showing N lines of context. I have been unable to do this with difflib.unified_diff. I need to show changes in both files.

The closest I can come is using diff on the command line like so:

/usr/bin/diff --unchanged-line-format=' %.2dn %L' --old-line-format="-%.2dn %L" --new-line-format="+%.2dn %L" file1.py file2.py

BUT I only want to show N lines of context, and /usr/bin/diff doesn't seem to support context with a custom line format (e.g. -U2 is not compatible with --line-format: "conflicting output style options").

Below is an example of what I'd like to accomplish (the same output as the above diff, but only showing 1 line of context surrounding changes):

+01: def renamed_function()
-01: def original_function():
 02:
+03: """ Neat stuff here """
 04:
 21:
+22: # Here's a new comment
 23:
 85: # Output the value of foo()
+86: print "Foo is %s"%(foo())
-86: print foo()
 87:
I was able to figure out something very close to what I wanted to do. It's slower than regular diff, though. Here's the entire code, from my project GitGate.

import difflib
import os
import re

def unified_diff(to_file_path, from_file_path, context=1):
    """ Returns a list of differences between two files based
    on some context. This is probably over-complicated. """
    # hunk headers emitted by difflib look like "@@ -1,3 +1,4 @@"
    pat_diff = re.compile(r'@@ (.[0-9]+\,[0-9]+) (.[0-9]+,[0-9]+) @@')

    from_lines = []
    if os.path.exists(from_file_path):
        from_fh = open(from_file_path,'r')
        from_lines = from_fh.readlines()
        from_fh.close()

    to_lines = []
    if os.path.exists(to_file_path):
        to_fh = open(to_file_path,'r')
        to_lines = to_fh.readlines()
        to_fh.close()

    diff_lines = []
    lines = difflib.unified_diff(to_lines, from_lines, n=context)
    for line in lines:
        if line.startswith('--') or line.startswith('++'):
            continue
        m = pat_diff.match(line)
        if m:
            left = m.group(1)
            right = m.group(2)
            lstart = left.split(',')[0][1:]
            rstart = right.split(',')[0][1:]
            diff_lines.append("## %s %s ##\n"%(left, right))
            to_lnum = int(lstart)
            from_lnum = int(rstart)
            continue
        code = line[0]
        lnum = from_lnum
        if code == '-':
            lnum = to_lnum
        diff_lines.append("%s%.4d: %s"%(code, lnum, line[1:]))
        if code == '-':
            to_lnum += 1
        elif code == '+':
            from_lnum += 1
        else:
            to_lnum += 1
            from_lnum += 1
    return diff_lines
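A possible way to call it (file names are examples, not from the original answer):

# Hypothetical usage: print the annotated diff lines for two files on disk.
for line in unified_diff("file2.py", "file1.py", context=1):
    print(line, end="")  # each entry usually keeps difflib's trailing newline, hence end=""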