Print specific lines from a text file in Python

I have a text file from which I'm trying to print the lines that don't start with a number, but the output is not readable.
This is part of what my code returns:
{\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf600
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;\red4\green4\blue4;\red247\green247\blue247;\red0\green0\blue0;
\red255\green255\blue255;\red77\green77\blue77;}
{\*\expandedcolortbl;;\cssrgb\c1176\c1176\c1176;\cssrgb\c97647\c97647\c97647;\cssrgb\c0\c0\c0;
\cssrgb\c100000\c100000\c100000;\cssrgb\c37647\c37647\c37647;}
\paperw11900\paperh16840\margl1440\margr1440\vieww10800\viewh8400\viewkind0
\deftab720
\pard\pardeftab720\partightenfactor0
\f0\fs26 \cf2 \cb3 Since the start of digital video in 1988, new video formats are developed every year\cf4 \cb5 \
\pard\pardeftab720\partightenfactor0
\cf6 \cb3 00:14\cb5 \
And this is my code:
numbers = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
aFile = open("/Users/maira/Desktop/text.rtf")
lines = aFile.readlines()
for line in lines:
    if not line.startswith(numbers):
        print(line)
aFile.close()
This is an example of the original text:
Since the start of digital video in 1988, new video formats are developed every year
00:14
in an attempt to provide improvements in quality, file size and video playback.
00:18
The popularity of video continues to grow rapidly, with 78% of people watching at least
00:24
one digital video on one of their devices every single day; However video formats and
00:29
how they work is still a subject of much confusion for most people.
I've seen some questions similar to mine but I can't get to a solution.
I'd appreciate any advice, and if there's also a way to delete the blank lines between lines, I'd be very thankful.
Thank you.

I used a very complete function provided in this answer to strip all the RTF markup, and after that I use a regex to remove the HH:MM timestamps. Maybe this will help you.
import re

def striprtf(text):
    pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
    # Control words which specify a "destination".
    destinations = frozenset((
'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype',
'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr',
'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
'listoverridetable','listpicture','liststylename','listtable','listtext',
'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
'svb','tc','template','themedata','title','txe','ud','upr','userprops',
'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
'xmlopen',
    ))
    # Translation of some special characters.
    specialchars = {
'par': '\n',
'sect': '\n\n',
'page': '\n\n',
'line': '\n',
'tab': '\t',
'emdash': u'\u2014',
'endash': u'\u2013',
'emspace': u'\u2003',
'enspace': u'\u2002',
'qmspace': u'\u2005',
'bullet': u'\u2022',
'lquote': u'\u2018',
'rquote': u'\u2019',
'ldblquote': u'\u201C',
'rdblquote': u'\u201D',
    }
    stack = []
    ignorable = False   # Whether this group (and all inside it) are "ignorable".
    ucskip = 1          # Number of ASCII characters to skip after a unicode character.
    curskip = 0         # Number of ASCII characters left to skip
    out = []            # Output buffer.
    for match in pattern.finditer(text):
        word, arg, hex, char, brace, tchar = match.groups()
        if brace:
            curskip = 0
            if brace == '{':
                # Push state
                stack.append((ucskip, ignorable))
            elif brace == '}':
                # Pop state
                ucskip, ignorable = stack.pop()
        elif char:  # \x (not a letter)
            curskip = 0
            if char == '~':
                if not ignorable:
                    out.append(u'\xA0')
            elif char in '{}\\':
                if not ignorable:
                    out.append(char)
            elif char == '*':
                ignorable = True
        elif word:  # \foo
            curskip = 0
            if word in destinations:
                ignorable = True
            elif ignorable:
                pass
            elif word in specialchars:
                out.append(specialchars[word])
            elif word == 'uc':
                ucskip = int(arg)
            elif word == 'u':
                c = int(arg)
                if c < 0:
                    c += 0x10000
                out.append(chr(c))  # on Python 3, chr() covers the full Unicode range (was unichr)
                curskip = ucskip
        elif hex:  # \'xx
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                c = int(hex, 16)
                out.append(chr(c))  # on Python 3, chr() covers the full Unicode range (was unichr)
        elif tchar:
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                out.append(tchar)
    return ''.join(out)

with open('/Users/maira/Desktop/text.rtf', 'r') as file:
    rtf = file.read()

text = striprtf(rtf)
text = re.sub('(0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]', '', text)  # remove HH:MM timestamps
print(text)
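If you also want to drop the blank lines that remain after stripping the timestamps (and any lines still starting with a digit), a small follow-up sketch, assuming text is the string produced above, could be:
cleaned = [line for line in text.splitlines()
           if line.strip() and not line.lstrip()[0].isdigit()]
print("\n".join(cleaned))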

Related

Python Pandas: Highlight single characters possible?

I am really new to Python development and have a question. In my research I have only found ways to change the style of a whole cell.
I am doing a complete character-by-character comparison and would like to colour the individual characters accordingly.
Maybe this can also be done in Python (my VBA script is very slow).
This is my Python script so far:
import pandas as pd
import numpy as np

path = "XXXXX"
data = pd.read_csv(path, names=["Dir1", "Dir2", "File1", "File2",
                                "Diff", "Line1", "A", "Line2", "B"], sep="|")
line = 1
for ind in data.index:
    if data.A[ind] == data.B[ind]:
        var_ok = True
    else:
        # Work just with different values
        var_ok = False
        var_length_A = len(str(data.A[ind]))
        var_length_B = len(str(data.B[ind]))
        # Check length
        if var_length_A > var_length_B:
            # A is longer
            var_longer = var_length_A
        elif var_length_A < var_length_B:
            # B is longer
            var_longer = var_length_B
        else:
            # Same length
            var_longer = var_length_A
        for count in range(1, var_longer):
            # Read every character; mid() is a VBA-style helper, not defined in Python
            var_sign_A = mid(data.A[ind], count, 1)
            var_sign_B = mid(data.B[ind], count, 1)
            if var_sign_A != var_sign_B:
                pass  # highlight this character
            else:
                pass  # do nothing
        print([ind], "|\t", data.A[ind], "|\t", data.B[ind], "|\t",
              var_ok, "|\t", var_length_A, "|\t", var_length_B, "|\t", var_longer)
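Note that Python has no built-in mid(); the calls above assume a helper that mimics VBA's Mid. A minimal sketch of such a helper (1-based start, fixed length) might be:
def mid(value, start, length):
    # Rough equivalent of VBA's Mid(): 1-based start index, 'length' characters.
    s = str(value)
    return s[start - 1:start - 1 + length]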
This is part of my VBA script:
' If both cells are filled
Else
    counter = 1
    ' Compare character by character
    For counter = counter To leng
        If Mid(Cells(zeile, Spalte1), counter, 1) <> Mid(Cells(zeile, Spalte2), counter, 1) Then
            With Cells(zeile, Spalte2).Characters(start:=counter, Length:=1).Font
                .Color = var2
                .FontStyle = "Fett"
            End With
            With Cells(zeile, Spalte1).Characters(start:=counter, Length:=1).Font
                .Color = var2
                .FontStyle = "Fett"
            End With
        Else
            With Cells(zeile, Spalte2).Characters(start:=counter, Length:=1).Font
                .Color = var1
                .FontStyle = "Standard"
            End With
            With Cells(zeile, Spalte1).Characters(start:=counter, Length:=1).Font
                .Color = var1
                .FontStyle = "Standard"
            End With
        End If
    Next
End If
End If
BR & thank you :)
Marcel

Depth first search algorithm skipping spaces in maze?

After concluding the first lecture of Harvard's AI course on edX, I have decided to implement the concepts taught, first being the depth-first search algorithm.
The objective of this program is to input a maze in text file mazefile and find a path from S to G using the depth-first search algorithm.
The project currently consists of four files: (1) the code with the class methods, (2) the text file which contains the maze, (3) another text file that will contain the result (where the AI has explored), and (4) the main Python script. Here they are; feel free to copy and paste these into a folder to see how they run.
processText.py (file 1)
#code to process the mazefile file.
class importMaze:
    def __init__(self, maze):
        self.fileLines = []
        self.fileName = maze
        self.switch = False
        self.toBeReturned = []

    def processThis(self):
        f = open(self.fileName, "r")
        for x in f:
            self.fileLines.append(x[:-1])
        f.close()
        for i in self.fileLines:
            if self.switch == True:
                if str(i) == "END":
                    self.switch = False
                else:
                    self.toBeReturned.append(i)
            else:
                if str(i) == "START":
                    self.switch = True
        return self.toBeReturned

class mazePointer:
    def __init__(self, mazearray):
        self.Sample = mazearray
        self.initialPosition = []
        for y in range(0, len(self.Sample)):
            for x in range(0, len(self.Sample[y])):
                if str(self.Sample[y][x]) == "S":
                    self.initialPosition = [x, y]
        self.currentPosition = self.initialPosition

    def whatIs(self, xcoordinate, ycoordinate):
        return (self.Sample[ycoordinate])[xcoordinate]

    def nearbyFreeSpaces(self, search):
        self.freeSpaces = []
        if self.whatIs(self.currentPosition[0]-1, self.currentPosition[1]) == search:
            self.freeSpaces.append([self.currentPosition[0]-1, self.currentPosition[1]])
        if self.whatIs(self.currentPosition[0]+1, self.currentPosition[1]) == search:
            self.freeSpaces.append([self.currentPosition[0]+1, self.currentPosition[1]])
        if self.whatIs(self.currentPosition[0], self.currentPosition[1]-1) == search:
            self.freeSpaces.append([self.currentPosition[0], self.currentPosition[1]-1])
        if self.whatIs(self.currentPosition[1], self.currentPosition[1]+1) == search:
            self.freeSpaces.append([self.currentPosition[1], self.currentPosition[1]+1])
        return self.freeSpaces

    def moveTo(self, position):
        self.currentPosition = position
TestingTrack.py (the main file)
from processText import importMaze, mazePointer

testObject = importMaze("mazefile")
environment = testObject.processThis()
finger = mazePointer(environment)
frontier = []
explored = []
result = ""

def Search():
    global result
    if len(finger.nearbyFreeSpaces("G")) == 1:  # If the goal is bordering this space
        result = finger.nearbyFreeSpaces("G")[0]
        explored.append(finger.currentPosition)
    else:
        newPlaces = finger.nearbyFreeSpaces("F")  # finds the free spaces bordering
        for i in newPlaces:
            if i in explored:  # Skips the ones already visited
                pass
            else:
                frontier.append(i)

while result == "":
    explored.append(finger.currentPosition)
    Search()
    finger.moveTo(frontier[-1])
    frontier.pop(-1)

exploredArray = []
for y in range(len(environment)):  # Recreates the maze, fills in 'E' where the AI has visited.
    holder = ""
    for x in range(len(environment[y])):
        if [x, y] in explored:
            holder += "E"
        else:
            holder += str(environment[y][x])
    exploredArray.append(holder)

def createResult(mazeList, title, append):  # Creating the file
    file = open("resultfile", append)
    string = title + " \n F - Free \n O - Occupied \n S - Starting point \n G - Goal \n E - Explored/Visited \n (Abdulaziz Albastaki 2020) \n \n (top left coordinate - 0,0) \n "
    for i in exploredArray:
        string += "\n" + str(i)
    string += "\n \n Original problem \n"
    for i in environment:
        string += "\n" + str(i)
    file.write(string)
    file.close()

def tracingPath():
    initialExplored = explored
    proceed = True
    newExplored = []
    for i in explored:
        finger.moveTo()  # incomplete
    print(explored)

createResult(exploredArray, "DEPTH FIRST SEARCH", "w")
mazefile (the program will read this file to get the maze)
F - Free
O - Occupied
S - Starting point
G - Goal
(Abdulaziz Albastaki 2020)
START
OOOOOOOOOOOOOOOO
OFFFFFFFFFFFFFGO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OSFFFFFFFFFFFFFO
OOOOOOOOOOOOOOOO
END
Made by Abdulaziz Albastaki in October 2020
You can change the maze and its size; however, it must
-Respect the key above
-Have ONE starting point and ONE goal
-Be placed between 'START' and 'END'
-Be surrounded by occupied space
SAMPLE PROBLEMS:
OOOOOOOOOOOOOOOO
OFFFFFFFFFFFFFGO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OSFFFFFFFFFFFFFO
OOOOOOOOOOOOOOOO
OOOOOOOOOOOOOOOOO
OFOFFFFFOOOFFFOOO
OFFFOOOFOOFFOOOFO
OFOOOOOFOOFOOOOFO
OSFGFFFFFFFFFFFFO
OOOOOOOOOOOOOOOOO
There is also a resultfile; if you just create an empty text file with that name (no extension), the program will fill it in with results.
The problem is with the resultfile; here it is:
DEPTH FIRST SEARCH
F - Free
O - Occupied
S - Starting point
G - Goal
E - Explored/Visited
(Abdulaziz Albastaki 2020)
(top left coordinate - 0,0)
OOOOOOOOOOOOOOOO
OFFFFFFFFFFFFFGO
OFOOOOOOOOOOOOEO
OFOOOOOOOOOOOOEO
OFOOOOOOOOOOOOEO
OEOOOOOOOOOOOOEO
OEFFFEEEEEEEEEEO
OOOOOOOOOOOOOOOO
Original problem
OOOOOOOOOOOOOOOO
OFFFFFFFFFFFFFGO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OFOOOOOOOOOOOOFO
OSFFFFFFFFFFFFFO
OOOOOOOOOOOOOOOO
The AI skipped a few spaces on its way to the goal. Why is it doing so?
Feel free to ask me for any clarifications.
There are the following issues:
The last if block in nearbyFreeSpaces uses a wrong index:
if self.whatIs(self.currentPosition[1], self.currentPosition[1]+1) == search:
    self.freeSpaces.append([self.currentPosition[1], self.currentPosition[1]+1])
should be:
if self.whatIs(self.currentPosition[0], self.currentPosition[1]+1) == search:
    self.freeSpaces.append([self.currentPosition[0], self.currentPosition[1]+1])
The final position is not correctly added to the path. The last line of this block:
if len(finger.nearbyFreeSpaces("G")) == 1:  # If the goal is bordering this space
    result = finger.nearbyFreeSpaces("G")[0]
    explored.append(finger.currentPosition)
...should be:
    explored.append(result)
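As a side note (not part of the original code), a less repetition-prone way to write nearbyFreeSpaces is to loop over the four neighbour offsets, which makes this kind of index slip harder to make:
def nearbyFreeSpaces(self, search):
    # Check the four orthogonal neighbours of the current position.
    x, y = self.currentPosition
    self.freeSpaces = []
    for dx, dy in ((-1, 0), (1, 0), (0, -1), (0, 1)):
        if self.whatIs(x + dx, y + dy) == search:
            self.freeSpaces.append([x + dx, y + dy])
    return self.freeSpaces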

Escape reserved characters in a list by adding a backslash in front of them

reserved_chars = """? & | ! { } [ ] ( ) ^ ~ * : \ " ' + -"""
list_vals = ['gold-bear#gmail.com', 'P&G#dom.com', 'JACKSON! BOT', 'annoying\name']
What is the fastest way to loop through every element in the list and add a \ in front of each reserved character that an element contains?
desired output:
fixed_list = ['gold\-bear#gmail.com', 'P\&G#dom.com', 'JACKSON\! BOT', 'annoying\\name']
You could make a translation table with str.maketrans() and pass that into translate. This takes a little setup, but you can reuse the translation table and it's quite fast:
reserved_chars = '''?&|!{}[]()^~*:\\"'+-'''
list_vals = ['gold-bear#gmail.com', 'P&G#dom.com', 'JACKSON! BOT', 'annoying\\name']
# make trans table
replace = ['\\' + l for l in reserved_chars]
trans = str.maketrans(dict(zip(reserved_chars, replace)))
# translate with trans table
fixed_list = [s.translate(trans) for s in list_vals]
print("\n".join(fixed_list))
Prints:
gold\-bear#gmail.com
P\&G#dom.com
JACKSON\! BOT
annoying\\name
There is no fast way: you have strings, strings are immutable, so you need to create new ones.
Probably the best way is to build your own translation dictionary and do the grunt work yourself:
reserved = """? & | ! { } [ ] ( ) ^ ~ * : \ " ' + -"""
tr = { c:f"\\{c}" for c in reserved}
print(tr)
data = ['gold-bear#gmail.com', 'P&G#dom.com', 'JACKSON! BOT', 'annoying\name']
transformed = [ ''.join(tr.get(letter,letter) for letter in word) for word in data]
for word in transformed:
print(word)
Output:
# translation dictionary
{'?': '\\?', ' ': '\\ ', '&': '\\&', '|': '\\|', '!': '\\!', '{': '\\{',
'}': '\\}', '[': '\\[', ']': '\\]', '(': '\\(', ')': '\\)', '^': '\\^',
'~': '\\~', '*': '\\*', ':': '\\:', '\\': '\\\\', '"': '\\"', "'": "\\'",
'+': '\\+', '-': '\\-'}
# transformed strings
gold\-bear#gmail.com
P\&G#dom.com
JACKSON\!\ BOT
annoying
ame
Side notes:
Your expected output does not escape the space inside 'JACKSON\! BOT', although the space is part of your reserved string, so the code above escapes it ('JACKSON\!\ BOT').
The repr() of the transformed list looks "wrongly" escaped because the repr escapes each '\' again; printing the individual words, as above, shows what is actually stored.
Definitely not the fastest, but could be the easiest to code. Make a regex that does it for you, and run re.sub, like this:
import re
reserved_chars = "?&|!{}[]()^~*:\\\"'+-"
replace_regex = "([" + ''.join('\\x%x' % ord(x) for x in reserved_chars) + "])"
list_vals = ['gold-bear#gmail.com', 'P&G#dom.com', 'JACKSON! BOT', r'annoying\name']
escaped_vals = [re.sub(replace_regex, r"\\\1", x) for x in list_vals]
Again, just to be clear: the regex approach is slow compared to the translate-based solutions above.
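A side note on building that character class: re.escape can do the per-character escaping, so the pattern above could also be written as follows (a sketch using the same reserved_chars and list_vals):
import re

reserved_chars = "?&|!{}[]()^~*:\\\"'+-"
list_vals = ['gold-bear#gmail.com', 'P&G#dom.com', 'JACKSON! BOT', r'annoying\name']

# re.escape() escapes every regex-special character, so the set is safe inside [ ... ].
replace_regex = "([" + re.escape(reserved_chars) + "])"
escaped_vals = [re.sub(replace_regex, r"\\\1", x) for x in list_vals]
print(escaped_vals)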

Own implement encoding and decoding base64 files in Python

I have a problem with my own implementation of Base64 encoding. I have managed to get to the code below. I suppose it only works for text files containing English letters. For instance, when a PDF file is encoded and then decoded, it differs in single characters.
def base64Encode(data):
    alphabet = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "/"]
    bit_str = ""
    base64_str = ""
    for char in data:
        bin_char = bin(char).lstrip("0b")
        bin_char = bin_char.zfill(8)
        bit_str += bin_char
    brackets = [bit_str[x:x+6] for x in range(0, len(bit_str), 6)]
    for bracket in brackets:
        if len(bracket) < 6:
            bracket = bracket + (6 - len(bracket)) * "0"
        base64_str += alphabet[int(bracket, 2)]
    # print(brackets[-4:])
    # if(bracket[-1:)
    # print(len(base64_str))
    # if(len(base64_str) != 76):
    #     base64_str += "="
    return base64_str

def base64Decode(text):
    alphabet = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "/"]
    bit_str = ""
    text_str = ""
    for char in text:
        if char in alphabet:
            bin_char = bin(alphabet.index(char)).lstrip("0b")
            bin_char = bin_char.zfill(6)
            bit_str += bin_char
    brackets = [bit_str[x:x+8] for x in range(0, len(bit_str), 8)]
    for bracket in brackets:
        text_str += chr(int(bracket, 2))
    return text_str.encode("UTF-8")

w = open("encode.txt", "w")
with open("bla.txt", "rb") as f:
    byte = f.read(57)
    while byte:
        w.write(base64Encode(byte))
        w.write("\n")
        byte = f.read(57)
w.close()
f.close()

w = open("decode.txt", "wb")
with open("encode.txt", "r") as f:
    byte = f.read(77)
    while byte:
        w.write(base64Decode(byte))
        byte = f.read(77)
w.close()
f.close()
In my opinion, the line return text_str.encode("UTF-8") should not encode to UTF-8 at all. However, if you leave only return text_str, you get the error: TypeError: 'str' does not support the buffer interface.
bla.txt:
Phil Mercer reports on Cyclone Pam which has ravaged the Pacific nation of Vanuatu. Video courtesy of YouTube/Isso Nihmei at 350.org
Save the Children's Vanuatu country director Tom Skirrow said on Saturday: "The scene here this morning is complete devastation - houses are destroyed, trees are down, roads are blocked and people are wandering the streets looking for help.
ĄŚĆŹŻÓ
encode.txt
UGhpbCBNZXJjZXIgcmVwb3J0cyBvbiBDeWNsb25lIFBhbSB3aGljaCBoYXMgcmF2YWdlZCB0aGUg
UGFjaWZpYyBuYXRpb24gb2YgVmFudWF0dS4gVmlkZW8gY291cnRlc3kgb2YgWW91VHViZS9Jc3Nv
IE5paG1laSBhdCAzNTAub3JnDQoNClNhdmUgdGhlIENoaWxkcmVuJ3MgVmFudWF0dSBjb3VudHJ5
IGRpcmVjdG9yIFRvbSBTa2lycm93IHNhaWQgb24gU2F0dXJkYXk6ICJUaGUgc2NlbmUgaGVyZSB0
aGlzIG1vcm5pbmcgaXMgY29tcGxldGUgZGV2YXN0YXRpb24gLSBob3VzZXMgYXJlIGRlc3Ryb3ll
ZCwgdHJlZXMgYXJlIGRvd24sIHJvYWRzIGFyZSBibG9ja2VkIGFuZCBwZW9wbGUgYXJlIHdhbmRl
cmluZyB0aGUgc3RyZWV0cyBsb29raW5nIGZvciBoZWxwLg0KDQrEhMWaxIbFucW7w5M
decode.txt
Phil Mercer reports on Cyclone Pam which has ravaged the Pacific nation of Vanuatu. Video courtesy of YouTube/Isso Nihmei at 350.org
Save the Children's Vanuatu country director Tom Skirrow said on Saturday: "The scene here this morning is complete devastation - houses are destroyed, trees are down, roads are blocked and people are wandering the streets looking for help.
ÄÅÄŹŻÃ
The same text encoded by page: http://www.motobit.com/util/base64-decoder-encoder.asp
UGhpbCBNZXJjZXIgcmVwb3J0cyBvbiBDeWNsb25lIFBhbSB3aGljaCBoYXMgcmF2YWdlZCB0aGUg
UGFjaWZpYyBuYXRpb24gb2YgVmFudWF0dS4gVmlkZW8gY291cnRlc3kgb2YgWW91VHViZS9Jc3Nv
IE5paG1laSBhdCAzNTAub3JnDQoNClNhdmUgdGhlIENoaWxkcmVuJ3MgVmFudWF0dSBjb3VudHJ5
IGRpcmVjdG9yIFRvbSBTa2lycm93IHNhaWQgb24gU2F0dXJkYXk6ICJUaGUgc2NlbmUgaGVyZSB0
aGlzIG1vcm5pbmcgaXMgY29tcGxldGUgZGV2YXN0YXRpb24gLSBob3VzZXMgYXJlIGRlc3Ryb3ll
ZCwgdHJlZXMgYXJlIGRvd24sIHJvYWRzIGFyZSBibG9ja2VkIGFuZCBwZW9wbGUgYXJlIHdhbmRl
cmluZyB0aGUgc3RyZWV0cyBsb29raW5nIGZvciBoZWxwLg0KDQrEhMWaxIbFucW7w5M=
It is the same except for the trailing "=", which I omitted from my implementation because of the error described at the very beginning.
And a sample original file in PDF:
%PDF-1.5
%µµµµ
1 0 obj
<</Type/Catalog/Pages 2 0 R/Lang(pl-PL) /StructTreeRoot 8 0 R/MarkInfo<</Marked true>>>>
endobj
2 0 obj
<</Type/Pages/Count 1/Kids[ 3 0 R] >>
endobj
3 0 obj
<</Type/Page/Parent 2 0 R/Resources<</Font<</F1 5 0 R>>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI] >>/MediaBox[ 0 0 595.32 841.92] /Contents 4 0 R/Group<</Type/Group/S/Transparency/CS/DeviceRGB>>/Tabs/S/StructParents 0>>
endobj
4 0 obj
<</Filter/FlateDecode/Length 110>>
stream
xœUÌ­
€#ྰï0QËÝ®Èiž?(†kb°hòý«ZD˜4ßÀΨ*;…¡xº ¨#“íªFrÄI!w…˜2ËQ81®D<™ÇS=Ó’léŠ82µ·>^åŒÊO- >[´SÀ
endstream
endobj
5 0 obj
<</Type/Font/Subtype/TrueType/Name/F1/BaseFont/ABCDEE+Calibri/Encoding/WinAnsiEncoding/FontDescriptor 6 0 R/FirstChar 32/LastChar 97/Widths 15 0 R>>
endobj
6 0 obj
<</Type/FontDescriptor/FontName/ABCDEE+Calibri/Flags 32/ItalicAngle 0/Ascent 750/Descent -250/CapHeight 750/AvgWidth 521/MaxWidth 1743/FontWeight 400/XHeight 250/StemV 52/FontBBox[ -503 -250 1240 750] /FontFile2 16 0 R>>
endobj
7 0 obj
And after executing the script:
%PDF-1.5
%µµµµ
1 0 obj
<</Type/Catalog/Pages 2 0 R/Lang(pl-PL) /StructTreeRoot 8 0 R/MarkInfo<</Marked true>>>>
endobj
2 0 obj
<</Type/Pages/Count 1/Kids[ 3 0 R] >>
endobj
3 0 obj
<</Type/Page/Parent 2 0 R/Resources<</Font<</F1 5 0 R>>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI] >>/MediaBox[ 0 0 595.32 841.92] /Contents 4 0 R/Group<</Type/Group/S/Transparency/CS/DeviceRGB>>/Tabs/S/StructParents 0>>
endobj
4 0 obj
<</Filter/FlateDecode/Length 110>>
stream
xUÌ­
#ྰï0QËÝ®Èi?(kb°hòý«ZD4ßÀΨ*;¡xº ¨#íªFrÄI!w2ËQ81®D<ÇS=Ólé82µ·>^åÊO- >[´SÀ
endstream
endobj
5 0 obj
<</Type/Font/Subtype/TrueType/Name/F1/BaseFont/ABCDEE+Calibri/Encoding/WinAnsiEncoding/FontDescriptor 6 0 R/FirstChar 32/LastChar 97/Widths 15 0 R>>
endobj
6 0 obj
<</Type/FontDescriptor/FontName/ABCDEE+Calibri/Flags 32/ItalicAngle 0/Ascent 750/Descent -250/CapHeight 750/AvgWidth 521/MaxWidth 1743/FontWeight 400/XHeight 250/StemV 52/FontBBox[ -503 -250 1240 750] /FontFile2 16 0 R>>
endobj
7 0 obj
The differences are, for instance, at the beginning of lines 15 and 16.
My goal is to load a file, encode it in Base64, then decode it and obtain the same file, fit for use.
I suppose the error is in how the data is read, written, or encoded. Any suggestions?
My first suggestion is troubleshooting: Determine if you are failing to encode or decode properly or both. Encode the file using a working utility and with your app and compare. Decode a properly encoded file with your app and with a working utility and compare.
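For the comparison step, the standard library's base64 module can produce the reference encoding (a troubleshooting sketch; the file names follow the question):
import base64

# Encode the same input with the standard library and diff the result
# against your encoder's output (encode.txt).
with open("bla.txt", "rb") as f:
    raw = f.read()

with open("reference.txt", "w") as out:
    out.write(base64.encodebytes(raw).decode("ascii"))  # wrapped into 76-char lines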
Second suggestion: Deal with the data as individual bytes, not text that may be interpreted as UTF-8.
Open the PDF file in binary mode. See Reading binary file in Python and looping over each byte on how to do that. Pass the raw bytes to your base64Encode. Do not use the bin function to convert from string to binary.
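For instance (a sketch, not the answerer's code), a byte read in binary mode can be turned into a fixed-width bit string without bin():
data = b"%PDF"  # any bytes object read from a file opened in "rb" mode
bit_str = "".join(format(b, "08b") for b in data)  # each byte as exactly 8 bits
print(bit_str)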
I was able to accomplish this task. Replace .encode("UTF-8") with .encode("latin-1") and it works, at least for PDF files.
I've modified the original code. This works on text, PNG and PDF files; I haven't tried other file types, but I expect it will work on them too.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 16 07:38:19 2019
@author: tracyanne
"""
import os

class Base64():

    def __init__(self):
        ## We only need to do this once
        self.b64 = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "/"]

    def Encode(self, data):
        alphabet = self.b64
        bit_str = ""
        base64_str = ""
        for char in data:
            bin_char = bin(char).lstrip("0b")
            bin_char = bin_char.zfill(8)
            bit_str += bin_char
        brackets = [bit_str[x:x+6] for x in range(0, len(bit_str), 6)]
        for bracket in brackets:
            if len(bracket) < 6:
                bracket = bracket + (6 - len(bracket)) * "0"
            base64_str += alphabet[int(bracket, 2)]
        ## Add padding characters to maintain compatibility with forced padding
        padding_indicator = len(base64_str) % 4
        if padding_indicator == 3:
            base64_str += "="
        elif padding_indicator == 2:
            base64_str += "=="
        return base64_str

    def Decode(self, text, eof):
        alphabet = self.b64
        bit_str = ""
        text_str = ""
        for char in text:
            if char in alphabet:
                bin_char = bin(alphabet.index(char)).lstrip("0b")
                bin_char = bin_char.zfill(6)
                bit_str += bin_char
        brackets = [bit_str[x:x+8] for x in range(0, len(bit_str), 8)]
        for bracket in brackets:
            ## When eof, ignore last value in brackets to remove \x00
            if eof and brackets[len(brackets) - 1] == bracket:
                pass
            else:
                text_str += chr(int(bracket, 2))
        ## Encode string as Latin-1 == ISO-8859-1
        return text_str.encode("ISO-8859-1")

    def base64Encode(self, inFile, outFile):
        w = open(outFile, "w")
        with open(inFile, "rb") as f:
            byte = f.read(57)
            while byte:
                w.write(self.Encode(byte))
                w.write("\n")
                byte = f.read(57)
        w.close()
        f.close()

    def base64Decode(self, inFile, outFile):
        ## Get size of input file for later comparison
        fsize = os.path.getsize(inFile)
        incsize = 0
        eof = False
        w = open(outFile, "wb")
        with open(inFile, "r") as f:
            byte = f.read(77)
            while byte:
                ## Keep count of the data read and, when it equals the
                ## input file size, set eof True
                incsize += len(byte)
                if fsize - incsize == 0:
                    eof = True
                ## Pass eof in to the class's own Decode
                w.write(self.Decode(byte, eof))
                byte = f.read(77)
        w.close()
        f.close()
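A minimal usage sketch (file names taken from the question):
codec = Base64()
codec.base64Encode("bla.txt", "encode.txt")     # original file -> Base64 text
codec.base64Decode("encode.txt", "decode.txt")  # Base64 text -> reconstructed file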

Python unified diff with line numbers from both "files"

I'm trying to figure out a way to create unified diffs with line numbers, showing only N lines of context. I have been unable to do this with difflib.unified_diff. I need to show line numbers from both files.
The closest I can come is using diff on the command line like so:
/usr/bin/diff \
    --unchanged-line-format=' %.2dn %L' \
    --old-line-format="-%.2dn %L" \
    --new-line-format="+%.2dn %L" \
    file1.py file2.py
BUT I only want to show N lines of context, and /usr/bin/diff doesn't seem to support context together with a custom line format (e.g. -U2 combined with a --line-format option fails with "conflicting output style options").
Below is an example of what I'd like to accomplish (the same output as the above diff, but only showing 1 line of context surrounding changes):
+01: def renamed_function()
-01: def original_function():
02:
+03: """ Neat stuff here """
04:
21:
+22: # Here's a new comment
23:
85: # Output the value of foo()
+86: print "Foo is %s"%(foo())
-86: print foo()
87:
I was able to figure out something very close to what I wanted to do. It's slower than regular diff, though. Here's the entire code, from my project GitGate.
import difflib
import os
import re

def unified_diff(to_file_path, from_file_path, context=1):
    """ Returns a list of differences between two files based
    on some context. This is probably over-complicated. """
    pat_diff = re.compile(r'@@ (.[0-9]+\,[0-9]+) (.[0-9]+,[0-9]+) @@')

    from_lines = []
    if os.path.exists(from_file_path):
        from_fh = open(from_file_path, 'r')
        from_lines = from_fh.readlines()
        from_fh.close()

    to_lines = []
    if os.path.exists(to_file_path):
        to_fh = open(to_file_path, 'r')
        to_lines = to_fh.readlines()
        to_fh.close()

    diff_lines = []
    lines = difflib.unified_diff(to_lines, from_lines, n=context)
    for line in lines:
        if line.startswith('--') or line.startswith('++'):
            continue

        m = pat_diff.match(line)
        if m:
            left = m.group(1)
            right = m.group(2)
            lstart = left.split(',')[0][1:]
            rstart = right.split(',')[0][1:]
            diff_lines.append("@@ %s %s @@\n" % (left, right))
            to_lnum = int(lstart)
            from_lnum = int(rstart)
            continue

        code = line[0]
        lnum = from_lnum
        if code == '-':
            lnum = to_lnum
        diff_lines.append("%s%.4d: %s" % (code, lnum, line[1:]))

        if code == '-':
            to_lnum += 1
        elif code == '+':
            from_lnum += 1
        else:
            to_lnum += 1
            from_lnum += 1
    return diff_lines
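A minimal usage sketch (the file names are placeholders):
for diff_line in unified_diff("file2.py", "file1.py", context=1):
    print(diff_line, end="")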
