How to find byte sequence in file? - python

I have a binary file in which I need to change a certain bit.
That bit's byte's address is relative to some byte sequence (some ASCII string):
# Load the whole file into an array of unsigned bytes, set one bit at a fixed
# offset from the "ABC" marker, and write the result back.
content = array('B')
with open(filename, mode="r+b") as file:
    content.fromfile(file, os.fstat(file.fileno()).st_size)
    abc = [ord(letter) for letter in "ABC"]
    # NOTE: this is the line the question is about -- array.index() looks for
    # a single element, not a sub-sequence, so passing a list fails.
    i = content.index(abc)  # ValueError: array.index(x): x not in list
    content[i + 0x16] |= 1
    # fromfile() left the file position at EOF; rewind so the modified bytes
    # overwrite the file instead of being appended to it.
    file.seek(0)
    content.tofile(file)
However, I must confess to my shame that, after Googling far and wide, I couldn't find a method to get the index of that "ABC" string...
Sure, I can write a function that does it with loops, but I can't believe there is no one-liner (OK, even two...) that accomplishes it.
How can it be done?

Not sure if this is the most Pythonic way, but this works. In this file
$ cat so.bin
���ABC̻�X��w
$ hexdump so.bin
0000000 eeff 41dd 4342 bbcc 58aa 8899 0a77
000000e
Edit: New solution starts here.
import string
# Copy so.bin to so.out.bin, replacing the byte that sits at a known offset
# behind the ASCII marker "ABC" with "Y".
with open("so.bin", "rb") as fi:
    data = bytearray(fi.read())

# bytes/bytearray objects can be searched for a sub-sequence directly;
# index() raises ValueError when the marker is absent.
pos = data.index(b"ABC")

# We now know the position of the interesting byte.
pos_x = pos + len(b"ABC") + 3  # known offset

# Replace the interesting byte (only if the file is long enough to contain it).
if pos_x < len(data):
    data[pos_x] = ord("Y")

# The output is written only after the search succeeded, in a single call.
with open("so.out.bin", "wb") as fo:
    fo.write(data)
Edit: New solution ends here.
I want to get the X four positions after ABC. A little state keeping locates the position of ABC, skips the offset, prints the interesting bytes.
# Print the byte(s) located a fixed distance behind the marker "ABC".
marker = b"ABC"
offsetAfterC = 3  # bytes to skip after the marker
lengthAfterC = 1  # number of interesting bytes to print

with open("so.bin", "rb") as f:
    data = f.read()

# index() matches the marker only as a contiguous sequence and raises
# ValueError when it is missing, instead of silently printing a bogus value.
start = data.index(marker) + len(marker) + offsetAfterC
for b in data[start:start + lengthAfterC]:
    # Iterating over bytes yields ints, which hex() formats directly.
    print(hex(b))
Output:
0x58

Related

ValueError: substring not found on lip reading code

This is what I have gotten while trying to run step 3 of this source code:
https://github.com/carykh/lazykh
Error:
Traceback (most recent call last):
File "C:\Users\User\Desktop\lazykh-main\code\scheduler.py", line 93, in
OS_nextIndex = originalScript.index(wordString,OS_IndexAt)+len(wordString)
ValueError: substring not found
Code:
import argparse
import os.path
import json
import numpy as np
import random
def addPhoneme(p, t):
    """Append a phoneme event to the phoneme section unless it repeats.

    Args:
        p: mouth-shape code to record (e.g. "a", "m").
        t: time of the event; formatted with three decimals.

    Side effects:
        Appends "<t>,phoneme,<p>\\n" to the module-level ``strings[4]`` and
        updates the module-level ``prevPhoneme`` when ``p`` differs from it.
    """
    global prevPhoneme
    if p != prevPhoneme:
        strings[4] += f"{t:.3f},phoneme,{p}\n"
        prevPhoneme = p
def pickNewPose(t):
    """Switch to a random pose that differs from the current and previous one.

    Args:
        t: time of the pose change; formatted with three decimals.

    Side effects:
        Updates the module-level ``pose`` and ``prevPose``, appends
        "<t>,pose,<pose>\\n" to ``strings[3]`` and resets ``prevPhoneme`` to
        "na".

    Raises:
        ValueError: if ``POSE_COUNT`` leaves no pose other than the current
            and previous ones (the original retry loop would spin forever).
    """
    global pose
    global prevPose
    global prevPhoneme

    # Choosing uniformly among the allowed poses has the same distribution as
    # re-rolling until an allowed pose comes up, but always terminates.
    candidates = [c for c in range(POSE_COUNT) if c != pose and c != prevPose]
    if not candidates:
        raise ValueError("POSE_COUNT is too small to pick a new pose")
    newPose = random.choice(candidates)

    prevPose = pose
    pose = newPose
    strings[3] += f"{t:.3f},pose,{pose}\n"
    prevPhoneme = "na"
# One output buffer per schedule section, written to the CSV in this order:
# 0 = paragraph, 1 = emotion, 2 = image, 3 = pose, 4 = phoneme.
strings = [""]*5
POSE_COUNT = 5
emotions = {
    "explain": 0,
    "happy": 1,
    "sad": 2,
    "angry": 3,
    "confused": 4,
    "rq": 5,
}
mouthList = [["aa","a"],["ae","a"],["ah","a"],["ao","a"],["aw","au"],
             ["ay","ay"],["b","m"],["ch","t"],["d","t"],["dh","t"],
             ["eh","a"],["er","u"],["ey","ay"],["f","f"],["g","t"],
             ["hh","y"],["ih","a"],["iy","ay"],["jh","t"],["k","t"],
             ["l","y"],["m","m"],["n","t"],["ng","t"],["ow","au"],
             ["oy","ua"],["p","m"],["r","u"],["s","t"],["sh","t"],
             ["t","t"],["th","t"],["uh","u"],["uw","u"],["v","f"],
             ["w","u"],["y","y"],["z","t"],["zh","t"],
             ["oov","m"]] # For unknown phonemes, the stick figure will just have a closed mouth ("mmm")
# Phoneme name -> mouth-shape code (one or two characters).
mouths = {entry[0]: entry[1] for entry in mouthList}
ENDING_PHONEME = "m"
STOPPERS = [",",";",".",":","!","?"]

parser = argparse.ArgumentParser(description='blah')
parser.add_argument('--input_file', type=str, help='the script')
args = parser.parse_args()
INPUT_FILE = args.input_file

# Both inputs are only read, so plain "r" is enough; the context managers
# close the files even if reading fails.
with open(INPUT_FILE+".txt","r") as f:
    originalScript = f.read()
with open(INPUT_FILE+".json","r") as f:
    fileData = f.read()
data = json.loads(fileData)
WORD_COUNT = len(data['words'])

pose = -1
prevPose = -1
prevPhoneme = "na"
emotion = "0"
pararaph = 0
image = 0
OS_IndexAt = 0
pickNewPose(0)
strings[1] += "0,emotion,0\n"
strings[0] += "0,paragraph,0\n"
strings[2] += "0,image,0\n"
strings[4] += "0,phoneme,m\n"


def _endOfWord(wordString, searchFrom):
    """Return the script index just past the next occurrence of wordString.

    Raises ValueError (like str.index) when the word is missing, but with a
    message naming the word and the search offset, so a mismatch between the
    script text and the alignment JSON can be located.
    """
    try:
        return originalScript.index(wordString, searchFrom) + len(wordString)
    except ValueError:
        raise ValueError(
            f"substring not found: word {wordString!r} from the alignment "
            f"JSON does not occur in the script at or after offset {searchFrom}"
        ) from None


for i, word in enumerate(data['words']):
    # Words without a "start" time carry no timing information; skip them.
    if "start" not in word:
        continue
    wordString = word["word"]
    timeStart = word["start"]

    OS_nextIndex = _endOfWord(wordString, OS_IndexAt)
    if "<" in originalScript[OS_IndexAt:]:
        tagStart = originalScript.index("<",OS_IndexAt)
        tagEnd = originalScript.index(">",OS_IndexAt)
        # The match ended inside a <tag>: search again behind the tag.
        if OS_nextIndex > tagStart and tagEnd >= OS_nextIndex:
            OS_nextIndex = _endOfWord(wordString, tagEnd)
    # Script text consumed by this word, including preceding punctuation,
    # tags and line breaks.
    nextDigest = originalScript[OS_IndexAt:OS_nextIndex]

    if ("\n" in nextDigest
            and data['words'][i-1]['case'] != 'not-found-in-audio'
            and prevPhoneme in ("a", "f", "u", "y")):
        addPhoneme("m", data['words'][i-1]["end"])

    # Debug output, kept disabled:
    # print(wordString)
    # print(str(OS_IndexAt)+", "+str(OS_nextIndex))
    # print(nextDigest)
    # print("")

    pickedPose = False
    for stopper in STOPPERS:
        if stopper in nextDigest:
            pickNewPose(timeStart)
            pickedPose = True

    if "<" in nextDigest:
        leftIndex = nextDigest.index("<")+1
        rightIndex = nextDigest.index(">")
        emotion = emotions[nextDigest[leftIndex:rightIndex]]
        strings[1] += f"{timeStart:.3f},emotion,{emotion}\n"
        prevPhoneme = "na"

    if "\n\n" in nextDigest:
        pararaph += 1
        image += 1 # The line of the script advances 2 lines whenever we hit a /n/n.
        strings[0] += f"{timeStart:.3f},paragraph,{pararaph}\n"
        prevPhoneme = "na"

    if "\n" in nextDigest:
        image += 1
        strings[2] += f"{timeStart:.3f},image,{image}\n"
        prevPhoneme = "na"
        if not pickedPose:
            pickNewPose(timeStart) # A new image means we also need to have a new pose

    phones = word["phones"]
    timeAt = timeStart
    for phone in phones:
        timeAt += phone["duration"]
        phoneString = phone["phone"]
        if phoneString == "sil":
            truePhone = "m"
        else:
            # Phone names look like "<phoneme>_<suffix>"; only the part in
            # front of the underscore is a key of ``mouths``.
            truePhone = mouths[phoneString[:phoneString.index("_")]]
        if len(truePhone) == 2:
            # Two-shape phonemes: first shape at the start, second half-way.
            addPhoneme(truePhone[0], timeAt-phone["duration"])
            addPhoneme(truePhone[1], timeAt-phone["duration"]*0.5)
        else:
            addPhoneme(truePhone, timeAt-phone["duration"])
    OS_IndexAt = OS_nextIndex

# Sections are separated (not terminated) by a "SECTION" line.
with open(INPUT_FILE+"_schedule.csv","w") as f:
    f.write("SECTION\n".join(strings))
print(f"Done creating schedule for {INPUT_FILE}.")
The
ValueError: substring not found
occurs when you try to find the index of a substring in a string which does not contain it in the specified (or default) section, using the index function.
The index method takes 3 parameters:
value
start
end
and it searches for the value between start and end.
So, the error occurred because the substring was not found in the section where it was searched for. The line of
OS_nextIndex = originalScript.index(wordString,tagEnd)+len(wordString)
searches for wordString, starting from tagEnd and searches for the likes of
<span>yourwordstring</span>
, but in your case it was not found. You can do one of the following to solve the issue:
you can fix your input if it should always have a match for the search
you can handle the error when the index throws the error
you can use find instead, see https://bobbyhadz.com/blog/python-valueerror-substring-not-found
Note that find also has three parameters, as you can read from https://www.w3schools.com/python/ref_string_find.asp

How to check if a string contains a specific character or not in python

I am new to Python, but fairly experienced in programming. While learning Python I was trying to create a simple function that would read words in from a text file (each line in the text file is a new word) and then check whether each word contains the letter 'e'. The program should then count the number of words that don't contain the letter 'e' and use that count to calculate the percentage of words in the text file that don't contain an 'e'.
I am running into a problem: I'm fairly certain that my code is right, but when I test it the output is wrong. Please help!
Here is the code:
def has_n_e(w):
    """Return True if ``w`` contains a lowercase 'e', otherwise False.

    Note that, despite the name, the result is True when an 'e' IS present;
    callers count the words for which this returns False.
    """
    return 'e' in w
# Count the words in crossword.txt that contain no 'e' and print the count
# and the percentage. (Code as posted in the question; see the NOTE below.)
f = open("crossword.txt","r")
count = 0
for x in f:
    # NOTE: this is the defect the question is about -- the loop already
    # delivers a line in ``x``; calling readline() here consumes a second
    # line, so only every other word is examined.
    word = f.readline()
    res = has_n_e(word)
    if res == False:
        count = count + 1
iAns = (count/113809)*100  # 113809 is the amount of words in the text file
print (count)
rAns = round(iAns,2)
sAns = str(rAns)
fAns = sAns + "%"
print(fAns)
Here is the code after doing some changes that may help:
def has_n_e(w):
    """Return True if ``w`` contains a lowercase 'e', otherwise False.

    Note that, despite the name, the result is True when an 'e' IS present;
    callers count the words for which this returns False.
    """
    return 'e' in w
# Read all words (one per line) and report how many contain no 'e'.
with open("crossword.txt", "r") as fh:
    f = fh.readlines()
count = 0
for x in f:
    # Strip only the line terminator; slicing with [:-1] would also cut the
    # last letter of a final line that has no trailing newline.
    word = x.rstrip("\n")
    res = has_n_e(word)  # you can use ('e' in word) instead of the function
    if not res:
        count = count + 1
# len(f) is the amount of words in the text file; an empty file yields 0 %.
iAns = (count / len(f)) * 100 if f else 0.0
print(count)
rAns = round(iAns, 2)
sAns = str(rAns)
fAns = sAns + "%"
print(fAns)
Hope this will help

Python: Rosalind Consensus and Profile

I am trying to solve the "Consensus and Profile" challenge on Rosalind.
The challenge instructions are as follows:
Given: A collection of at most 10 DNA strings of equal length (at most 1 kbp) in FASTA format.
Return: A consensus string and profile matrix for the collection. (If several possible consensus strings exist, then you may return any one of them.)
My code is as follows (I got most of it from another user on this website). My only issue is that some of the DNA strands are broken down into multiple separate lines, so they are being appended to the "allstrings" list as separate strings. I am trying to figure out how to write each consecutive line that does not contain ">" as a single string.
import numpy as np
# Rosalind "Consensus and Profile": build the profile matrix and a consensus
# string for the sequences in a FASTA file. (Code as posted in the question;
# see the NOTE comments for the behaviour the question asks about.)
seq = []
allstrings = []
temp_seq = []
matrix = []
C = []
G = []
T = []
A = []
P = []
consensus = []
position = 1
file = open("C:/Users/knigh/Documents/rosalind_cons (3).txt", "r")
conout = open("C:/Users/knigh/Documents/consensus.txt", "w")
# Right now, this is reading and writing each as an individual line. Thus, it
# is splitting each sequence into multiple small sequences. You need to figure
# out how to read this in FASTA format to prevent this from occurring
desc = file.readlines()
for line in desc:
    allstrings.append(line)
# Index 0 is skipped: it is expected to be the first ">" header line.
for string in range(1, len(allstrings)):
    if ">" not in allstrings[string]:
        temp_seq.append(allstrings[string])
    else:
        # NOTE: only the first collected line is kept, so a sequence that is
        # wrapped over several lines is truncated -- this is the problem the
        # question describes.
        seq.insert(position, temp_seq[0])
        temp_seq = []
        position += 1
# This last insertion into the sequence must be performed after the loop to empty
# out the last remaining string from temp_seq
seq.insert(position, temp_seq[0])
for base in seq:
    matrix.append([pos for pos in base])
M = np.array(matrix).reshape(len(seq), len(seq[0]))
# Count each nucleotide column by column.
for base in range(len(seq[0])):
    A_count = 0
    C_count = 0
    G_count = 0
    T_count = 0
    for pos in M[:, base]:
        if pos == "A":
            A_count += 1
        elif pos == "C":
            C_count += 1
        elif pos == "G":
            G_count += 1
        elif pos == "T":
            T_count += 1
    A.append(A_count)
    C.append(C_count)
    G.append(G_count)
    T.append(T_count)
profile_matrix = {"A": A, "C": C, "G": G, "T": T}
P.append(A)
P.append(C)
P.append(G)
P.append(T)
profile = np.array(P).reshape(4, len(A))
# For every column take the nucleotide with the highest count; ties are
# resolved in the order A, C, G, T.
for pos in range(len(A)):
    if max(profile[:, pos]) == profile[0, pos]:
        consensus.append("A")
    elif max(profile[:, pos]) == profile[1, pos]:
        consensus.append("C")
    elif max(profile[:, pos]) == profile[2, pos]:
        consensus.append("G")
    elif max(profile[:, pos]) == profile[3, pos]:
        consensus.append("T")
conout.write("".join(consensus) + "\n")
for k, v in profile_matrix.items():
    conout.write(k + ": " + " ".join(str(x) for x in v) + "\n")
conout.close()
There are a couple of ways that you can iterate a FASTA file as records. You can use a prebuilt library or write your own.
A widely used library for working with sequence data is biopython. This code snippet will create a list of strings.
from Bio import SeqIO
file = "path/to/your/file.fa"
sequences = []
with open(file, "r") as file_handle:
    for record in SeqIO.parse(file_handle, "fasta"):
        # record.seq is a Seq object; convert it so that the list really
        # holds plain strings, as described above.
        sequences.append(str(record.seq))
Alternatively, you can write your own FASTA parser. Something like this should work:
def read_fasta(fh):
    """Yield ``(name, sequence)`` tuples for every record of a FASTA source.

    Args:
        fh: an iterable of text lines (for example an open file).

    Yields:
        The header text behind ">" (stripped) and the record's sequence with
        all of its lines joined into one string.

    Lines in front of the first header are ignored; input without any header
    yields nothing.
    """
    name = None
    # This list will hold the sequence lines of the current record
    fa_lines = []
    for line in fh:
        if line.startswith(">"):
            # A new header closes the previous record (if there was one)
            if name is not None:
                yield name, "".join(fa_lines)
            # Reset the sequence lines and save the name of the next record
            name = line[1:].strip()
            fa_lines = []
        elif name is not None:
            fa_lines.append(line.strip())
    # The last record is not followed by a header, so emit it here
    if name is not None:
        yield name, "".join(fa_lines)
You can use this function like so:
file = "path/to/your/file.fa"
with open(file, "r") as file_handle:
    # Keep only the sequence of every (name, sequence) record.
    sequences = [seq for name, seq in read_fasta(file_handle)]

Can I use bisect to print the content of a line?

I have a file where each line is ordered alphabetically. The file is 12Gb, which means I can't simply read it line by line. The data looks like this:
brown 0 1 0 1 2
fox 3 5 0 0 1
jumped 2 0 6 1 0
The words at the beginning of each line are unique. The word and the numbers on each line are separated by tabs. I want to be able to query the file for specific keywords. For example, if I query "fox", the program should return "fox 3 5 0 0 1".
It seems that a good candidate for this would be the bisect module: https://docs.python.org/3.0/library/bisect.html
I found a post which uses bisect to find out the line number of a keyword: How do I perform binary search on a text file to search a keyword in python?
This is what the code looks like:
import bisect
import os
class Query(object):
    """Search key that compares itself against the tail of a raw record.

    ``index`` is the number of leading characters of a record to skip before
    the record is compared with ``query``.
    """

    def __init__(self, query, index=5):
        self.query = query
        self.index = index

    def __lt__(self, comparable):
        # Used by bisect.bisect / bisect_right, which evaluates ``key < record``.
        return self.query < comparable[self.index:]

    def __gt__(self, comparable):
        # Used by bisect.bisect_left, which evaluates ``record < key``; Python
        # falls back to this reflected comparison for plain string records.
        return self.query > comparable[self.index:]
class FileSearcher(object):
    """Present a file of fixed-size records as a read-only sequence.

    Args:
        file_pointer: an open, seekable file object.
        record_size: length of one record without its line terminator; the
            length of ``os.linesep`` is added to it.
    """

    def __init__(self, file_pointer, record_size=35):
        self.file_pointer = file_pointer
        # Seeking to the end and asking for the position gives the file size.
        self.file_pointer.seek(0, os.SEEK_END)
        self.record_size = record_size + len(os.linesep)
        self.num_bytes = self.file_pointer.tell()
        # Number of complete records; a trailing partial record is ignored.
        self.file_size = (self.num_bytes // self.record_size)

    def __len__(self):
        return self.file_size

    def __getitem__(self, item):
        # Raising IndexError outside the valid range lets plain iteration
        # over the object terminate instead of reading "" forever.
        if not 0 <= item < self.file_size:
            raise IndexError("record index out of range")
        self.file_pointer.seek(item * self.record_size)
        return self.file_pointer.read(self.record_size)
with open('myfile') as file_to_search:
    query = 'fox\t' #token to query
    wrapped_query = Query(query)
    searchable_file = FileSearcher(file_to_search)
    linepos = bisect.bisect(searchable_file, wrapped_query)
    # A single formatted string prints the same text under Python 2 and 3.
    print("Located # line:  %s" % linepos)
    #print content of line?
However, I can't figure out how to actually print the content of the line. I should at least add a read statement somewhere, but I don't know where.
Is it possible to print the content of the line with the bisect module?
If you want to go with a Python solution, you can do the following:
Read file by small chunks of MAX_LINE bytes, each time moving forward by fixed offset
That offset determines block size
For each such read, determine the key (first word in a line)
These keys serve as delimiters of blocks
Construct the list of such keys. The list would be sorted as keys are ordered
You may persist such list somewhere via pickle/json.dumps/...
When querying, use bisect to find the index of the block in which your key is located
Read that block entirely and find the key with data
Here is the example file bigfile:
abc 4
bar 2
baz 3
egg 6
foo 1
god 8
ham 5
sex 7
The code:
import os
from bisect import bisect
MAX_LINE = 7     # upper bound for the length of one line in bytes
BLOCK_SIZE = 10  # distance in bytes between two sampled positions


def parse_chunks(filename):
    """Build the sorted list of block keys for a sorted text file.

    Entry 0 is the first word of the file's first line. Entry ``i`` is the
    first word of the first complete line that begins after the first
    newline found at or behind byte ``i * BLOCK_SIZE``. Sampling stops as
    soon as a read no longer contains two newlines.

    Args:
        filename: path of the file to index.

    Returns:
        list of str keys (decoded as UTF-8); empty for an empty file.
    """
    size = os.path.getsize(filename)
    chunks = []
    # The file is handled as bytes throughout: str() of a bytes object gives
    # its repr ("b'...'") on Python 3 and would never contain a real newline.
    with open(filename, 'rb') as file:
        block = file.read(MAX_LINE * 2)
        first_fields = block.split(b'\n', 1)[0].split()
        if not first_fields:
            return chunks
        chunks.append(first_fields[0].decode('utf-8'))
        pos = BLOCK_SIZE
        while pos < size:
            file.seek(pos)
            block = file.read(MAX_LINE * 2)
            # The text up to the first newline is the tail of a line that
            # started earlier; the key comes from the line that follows it.
            first_eol = block.find(b'\n')
            second_eol = block.find(b'\n', first_eol + 1)
            if first_eol == -1 or second_eol == -1:
                break
            line = block[first_eol + 1:second_eol]
            key = line.split()[0]
            chunks.append(key.decode('utf-8'))
            pos += BLOCK_SIZE
    return chunks
if __name__ == '__main__':
    BLOCK_SIZE = 10
    filename = 'bigfile'
    chunks = parse_chunks(filename)
    query = 'abc'
    # Index of the last block whose key is <= query; -1 means the query sorts
    # in front of the first key and therefore cannot be in the file.
    pos_before = bisect(chunks, query) - 1
    line = None
    if pos_before >= 0:
        read_size = BLOCK_SIZE + MAX_LINE
        with open(filename, 'rb') as file:
            file.seek(pos_before*BLOCK_SIZE)
            block = file.read(read_size)
        if pos_before > 0:
            # Everything up to the first newline belongs to the previous block.
            block = block.partition(b'\n')[2]
        candidates = block.split(b'\n')
        if len(block) and not block.endswith(b'\n') and pos_before*BLOCK_SIZE + read_size < os.path.getsize(filename):
            # The read stopped in the middle of a line: ignore the fragment.
            candidates.pop()
        # Compare whole keys instead of searching for a substring, so that a
        # query cannot match inside another line.
        for candidate in candidates:
            fields = candidate.split()
            if fields and fields[0].decode('utf-8', 'replace') == query:
                line = candidate.decode('utf-8', 'replace')
                break
    if line is None:
        print("%r not found" % query)
    else:
        print(line)
In this toy example I use block size of 10 bytes, in your case of 12GB file I'd suggest you to start with 1M.
The following recursive function should be able to narrow the search interval. I'm not sure that you can modify it so that it returns a match or None for no match.
def bisearch(f, word, i, j, window=10**6):
    """Narrow the interval of a sorted, tab-separated file that holds ``word``.

    Args:
        f: seekable file object opened in text mode. Offsets are counted in
            characters, so this presumably needs an encoding in which one
            character is one byte -- verify for your data.
        word: key to look for (the text in front of the first tab of a line).
        i, j: start and end offset of the interval to search.
        window: stop narrowing once the interval is smaller than this.

    Returns:
        ``(k, k)`` when a line starting at offset ``k`` has exactly this key,
        otherwise ``(i, j)`` with an interval smaller than ``window`` that
        the caller has to scan line by line.
    """
    window = max(window, 1)
    if (j - i) < window:
        return i, j
    mid = (i + j) // 2
    f.seek(mid)
    # ``mid`` normally lies inside a line: skip to the start of the next one,
    # but never read beyond ``j``.
    skipped = f.readline(j - mid)
    k = mid + len(skipped)
    if not skipped.endswith('\n') or k >= j:
        # No line starts strictly between ``mid`` and ``j``; continue in the
        # lower half.
        return bisearch(f, word, i, mid, window)
    w = f.readline().rstrip('\n').split('\t', 1)[0]
    if w == word:
        return k, k
    if w < word:
        return bisearch(f, word, k, j, window)
    return bisearch(f, word, i, k, window)
and here an example of usage
word = ...
f = open(...)
i, j = bisearch(f, word, 0, len_f)
f.seek(i)
if i == j:
    # Exact hit: the line starts at offset i.
    line = f.readline()
else:
    #################### EDIT ################
    # OLD
    # buffer = f.read(1E6)
    # NEW
    buffer = f.read(j - i)
    lenw = len(word)
    for line in buffer.split('\n'):
        if line[:lenw] == word:
            break
    else:
        # no matches, SOS
        raise LookupError(word)
result = process(line)
Try seeking to the line in question and using readline.
# A single formatted string prints the same text under Python 2 and 3.
print("Located # line:  %s" % linepos)
file_to_search.seek(linepos)
line = file_to_search.readline()
This is assuming linepos is the position of the line, counted in bytes from the beginning of the file. If it's the position counted in line numbers, you'll need to multiply by the number of bytes per line before seeking.
# A single formatted string prints the same text under Python 2 and 3.
print("Located # line:  %s" % linepos)
file_to_search.seek(linepos * searchable_file.record_size)
line = file_to_search.readline()

python to search within pdf file

here is part of pdf structure:
5 0 obj
<< /Length 56 >>
stream
BT /F1 12 Tf 100 700 Td 15 TL (JavaScript example) Tj ET
endstream
endobj
6 0 obj
<<
/Type /Font
/Subtype /Type1
/Name /F1
/BaseFont /Helvetica
/Encoding /MacRomanEncoding
>>
endobj
7 0 obj
<<
/Type /Action
/S /JavaScript
I want to check whether "javascript" is present or not. The problem is that "javascript" can be written with hex escapes, either as a whole or in part: "javascript", "Jav#61Script", "J#61v#61Script" and so on.
So how can I find out whether "javascript" exists, given all of these possibilities?
Read it in a character at a time and translate any hex you find to characters as you go, also translating to lowercase. Compare the result to "javascript".
Here's an idea:
import string
import os
import re
def pdf_find_str(pdfname, str):
    """Search a PDF file for a string, decoding ``#xx`` hex escapes on the way.

    The comparison ignores case, and an escape is decoded only when the
    character it stands for occurs in the search string.

    Args:
        pdfname: path of the file to search.
        str: literal text to look for (the name shadows the builtin and is
            kept only for compatibility with existing callers).

    Returns:
        Approximate position of the first match, or -1 if there is none. The
        position does not account for characters removed by decoding escapes.
    """
    needle = str
    wanted = needle.lower()
    # read the file CHUNK_SIZE chars at a time, keeping last KEEP_SIZE chars
    CHUNK_SIZE = 2*1024*1024
    KEEP_SIZE = 3 * len(needle) # each char might be in #ff form
    # Hex digits may be written in either case.
    hex_escape = re.compile(r"#([0-9A-Fa-f]{2})")
    # The needle is literal text, not a regular expression.
    pattern = re.compile(re.escape(needle), re.I)

    def decode(match):
        ch = chr(int(match.group(1), 16)).lower()
        # avoid replacing escapes that cannot be part of a match
        return ch if ch in wanted else match.group(0)

    ichunk = 0
    # The context manager closes the file on every exit path, including the
    # early return on a match. Latin-1 maps each byte to exactly one
    # character, so arbitrary binary content decodes without errors.
    with open(pdfname, "rb") as f:
        chunk = f.read(CHUNK_SIZE).decode("latin-1")
        while len(chunk) > 0:
            # Replace all decodable #xx escapes in one pass.
            chunk = hex_escape.sub(decode, chunk)
            m = pattern.search(chunk)
            if m:
                return ichunk * (CHUNK_SIZE-KEEP_SIZE) + m.start()
            # Transfer last KEEP_SIZE characters to beginning for next round of
            # testing since our string may span chunks.
            next_chunk = f.read(CHUNK_SIZE - KEEP_SIZE).decode("latin-1")
            if len(next_chunk) == 0:
                break
            chunk = chunk[-KEEP_SIZE:] + next_chunk
            ichunk += 1
    return -1
# On one file:
#if pdf_find_str("Consciousness Explained.pdf", "javascript") != -1:
#    print('Contains "javascript"')
# Recursively on a directory:
for root, dirs, files in os.walk("Books"):
    for filename in files:
        if filename.endswith(".pdf"):
            position = pdf_find_str(os.path.join(root, filename), "javascript")
            if position != -1:
                # A single formatted string prints the same text under
                # Python 2 and 3.
                print("%s ( %s )" % (filename, position))
# Note: position returned by pdf_find_str does not account for removed
# characters from #ff representations (if any).

Categories

Resources