python to search within pdf file

python to search within pdf file - python

here is part of pdf structure:
5 0 obj
<< /Length 56 >>
stream
BT /F1 12 Tf 100 700 Td 15 TL (JavaScript example) Tj ET
endstream
endobj
6 0 obj
<<
/Type /Font
/Subtype /Type1
/Name /F1
/BaseFont /Helvetica
/Encoding /MacRomanEncoding
>>
endobj
7 0 obj
<<
/Type /Action
/S /JavaScript
I want to check whether "javascript" is present or not. The problem is that in a PDF the word can be written with hex escapes, either for the whole word or for only part of it, e.g. "javascript", "Jav#61Script", "J#61v#61Script" and so on.
So how can I find out whether "javascript" exists, given all of these possibilities?

Read it in a character at a time and translate any hex you find to characters as you go, also translating to lowercase. Compare the result to "javascript".
Here's an idea:
import string
import os
import re
def pdf_find_str(pdfname, str):
f = open(pdfname, "rb")
# read the file CHUNK_SIZE chars at a time, keeping last KEEP_SIZE chars
CHUNK_SIZE = 2*1024*1024
KEEP_SIZE = 3 * len(str) # each char might be in #ff form
hexvals = "0123456789abcdef"
ichunk = removed = 0
chunk = f.read(CHUNK_SIZE)
while len(chunk) > 0:
# Loop to find all #'s and replace them with the character they represent.
hpos = chunk.find('#')
while hpos != -1:
if len(chunk)-hpos >= 3 and chunk[hpos+1] in hexvals and chunk[hpos+2] in hexvals:
hex = int(chunk[hpos+1:hpos+3], 16) # next two characters are int value
ch = chr(hex).lower()
if ch in str: # avoid doing this if ch is not in str
chunk = chunk[:hpos] + ch + chunk[hpos+3:]
removed += 2
hpos = chunk.find('#', hpos+1)
m = re.search(str, chunk, re.I)
if m:
return ichunk * (CHUNK_SIZE-KEEP_SIZE) + m.start()
# Transfer last KEEP_SIZE characters to beginning for next round of
# testing since our string may span chunks.
next_chunk = f.read(CHUNK_SIZE - KEEP_SIZE)
if len(next_chunk) == 0: break
chunk = chunk[-KEEP_SIZE:] + next_chunk
ichunk += 1
f.close()
return -1
# On one file:
#if pdf_find_str("Consciousness Explained.pdf", "javascript") != -1:
# print 'Contains "javascript"'
# Recursively on a directory:
for root, dirs, files in os.walk("Books"):
for file in files:
if file.endswith(".pdf"):
position = pdf_find_str(root + "/" + file, "javascript")
if position != -1:
print file, "(", position, ")"
# Note: position returned by pdf_find_str does not account for removed
# characters from #ff representations (if any).

Related

How to test unicode code points for valid LTR use in Python?

This question is relative to this one. But as my tried solution does not work, I open a new question to deal with my specific problems.
Context:
In the application I develop, I need to build python regex that includes unicodes, possibly in the whole range(0, 0x110000). When I build my regex, for example with the following:
regex += mycodepoint_as_char + ".{0," + str(max_repeat) + "}"
I observe that for some code points the order is reversed, as if I had written:
regex += "{0," + str(max_repeat) + "}." + mycodepoint_as_char
regex = ή.{0,2}{0,3}.䝆⚭.{0,3}俩.{0,4}ⷭ
In other cases, I have an exception.
So I studied the standard for bidirectional Unicode text and some Q/A that explain surrogate pairs, the special Left-To-Right and Right-To-Left code points, and the prohibited ones reserved for UTF-16.
My problem:
Then I have decided to test all of them, and to build a list of RTL ones and prohibited ones, assuming the first would change the order in the string, and that the last would raise an exception.
Here is my test code:
#!/usr/bin/python3
import sys
import os
import unicodedata #https://docs.python.org/fr/3/library/unicodedata.html, https://fr.wikipedia.org/wiki/Normalisation_Unicode
def group_consecutive(l):
    """Collapse runs of consecutive integers into half-open value ranges.

    Args:
        l: a list of integers; a run is a stretch where each item is exactly
            one more than the item before it.

    Returns:
        A list of ``(first, last + 1)`` tuples, one per run, so that
        ``range(first, last + 1)`` reproduces the values of that run. The
        tuples hold the *values* from ``l``, not positions inside ``l``.
    """
    res = []
    start = 0
    while start < len(l):
        end = start
        while end + 1 < len(l) and l[end + 1] == l[end] + 1:
            end += 1
        # Report the values themselves: reporting (start, end + 1) would give
        # list positions, which only coincide with the values by accident.
        res.append((l[start], l[end] + 1))
        start = end + 1
    return res
def id_rtl_code_points(quiet=False):
    """Survey every code point for print failures and apparent reordering.

    Each code point in range(0, 0x110000) is printed after a separator
    letter. Code points whose print raises UnicodeEncodeError are collected
    as "forbidden"; the others are appended to a test string that is then
    inspected for displaced separators. Both lists are printed, grouped by
    group_consecutive, together with their share of the code space.

    Args:
        quiet: when true, the per-code-point probe is written to os.devnull
            instead of sys.stdout, so the terminal is not fed control
            characters. Progress lines and results always go to sys.stdout.
    """
    forbiddenCP = []
    accepted = []
    sep = 'a'  # choose a letter that can receive modifiers
    sink = open(os.devnull, 'w') if quiet else sys.stdout
    try:
        for i in range(0, 0x110000):
            if i % 0x10000 == 0:
                print(hex(i) + "-------------")  # show progress
            try:
                # Encoding happens on write, so this is where an unencodable
                # code point fails.
                print(hex(i), " : " + sep + chr(i), file=sink)
            except UnicodeEncodeError:
                # Only encoding failures are of interest; Ctrl-C and I/O
                # errors must not be counted as forbidden code points.
                forbiddenCP.append(i)
            else:
                accepted.append(chr(i))
    finally:
        if quiet:
            sink.close()
    # sep at even indexes, the tested character at odd indexes.
    s = "".join(sep + ch for ch in accepted) + sep
    rtlCP = []
    # NOTE(review): str indexing follows storage order, so by construction
    # every even index holds sep and this loop never records anything;
    # right-to-left reordering only happens when a terminal renders the text.
    for i in range(0, len(s), 2):
        if s[i] != sep:
            rtlCP.append(ord(s[i]))
    print("rtlCP = ", group_consecutive(rtlCP))
    print("rtlCP% = ", round(float(len(rtlCP))/0x110000*100, 2), "%")
    print("forbiddenCP = ", group_consecutive(forbiddenCP))
    print("forbiddenCP% = ", round(float(len(forbiddenCP))/0x110000*100, 2), "%")
def main():
    """Script entry point: run the code point survey once."""
    id_rtl_code_points()


# Only run the survey when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Run as it is, I get (I skip parts with dots):
$ ./test.py
0x0-------------
0x0 : a
0x1 : a
0x2 : a
....................
0x21 : a!
0x22 : a"
0x23 : a#
0x24 : a$
....................
0x60 : a`
0x61 : aa
0x62 : ab
0x63 : ac
0x64 : ad
....................
0x98 : a
0x9a : a
0x9b : a
9c : a
0x9d : a$ 1;1;120;120;1;0x
Not so good, I don't understand why it stops displaying.
If I forward stdout to /dev/null for the exception test (uncomment lines 33 and 41), I get:
$ ./test.py
0x0-------------
0x10000-------------
0x20000-------------
0x30000-------------
0x40000-------------
0x50000-------------
0x60000-------------
0x70000-------------
0x80000-------------
0x90000-------------
0xa0000-------------
0xb0000-------------
0xc0000-------------
0xd0000-------------
0xe0000-------------
0xf0000-------------
0x100000-------------
rtlCP = []
rtlCP% = 0.0 %
forbiddenCP = [(0, 2048)]
forbiddenCP% = 0.18 %
Would the first 2048 code points really raise an exception? Of course not, so this result looks wrong. I would have expected problems in the range U+D800 to U+DFFF.
Is my approach correct, and if so what am I missing? Or is it nonsense, and if so why?

How to get textwrap in Python to break on underscores or arbitrary characters (not just hyphens and whitespace)

The following works well:
from textwrap import fill
print(fill("hello-there", 8))
Outputs:
hello-
there
However, I am using a lot of text where words are separated with underscores, not hyphens. The option break_on_hyphens is great but there seems to be no way to specify other separators.
I looked around and was really surprised to not find anything on this. Does anyone have any idea of the best way to proceed?

So the quick-and-dirty way I found was to simply write a wrapper around the function that replaces the desired character with a hyphen to make that function work. It's not very Pythonic, but would get the job done. I'm hoping someone else can come up with the "actual" answer (if it exists...).
Note that I wrote a "fill" version as I am looking for a string, not a list.
Python 3:
from textwrap import wrap
def fill_custom_sep(source_text, separator_char, width=70, **kwargs):
    """Wrap text like textwrap.fill, also allowing breaks after a separator.

    The separator is temporarily swapped for a hyphen so that textwrap's
    break_on_hyphens logic can break after it; the wrapped result is then
    walked in step with the hyphenated text so every swapped hyphen, not
    only one that ends a line, is turned back into the separator, while
    hyphens that were in the text from the start are left alone.

    Args:
        source_text: the text to wrap.
        separator_char: the extra character after which a break is allowed.
        width: maximum line length, as for textwrap.wrap.
        **kwargs: passed through to textwrap.wrap.

    Returns:
        The wrapped lines joined with newlines.
    """
    if not separator_char:
        return "\n".join(wrap(source_text, width, **kwargs))

    pieces = source_text.split(separator_char)
    hyphenated = "-".join(pieces)
    # Positions in `hyphenated` whose hyphen stands in for a separator.
    swapped = set()
    offset = -1
    for piece in pieces[:-1]:
        offset += len(piece) + 1
        swapped.add(offset)

    restored_lines = []
    cursor = 0
    for line in wrap(hyphenated, width, **kwargs):
        rebuilt = []
        for ch in line:
            # wrap() may drop or rewrite whitespace; skip past it so the
            # cursor stays aligned with the characters that survived.
            while (cursor < len(hyphenated) and hyphenated[cursor] != ch
                   and hyphenated[cursor].isspace()):
                cursor += 1
            if cursor < len(hyphenated) and hyphenated[cursor] == ch:
                rebuilt.append(separator_char if cursor in swapped else ch)
                cursor += 1
            else:
                # Character synthesised by wrap() (expanded tab, placeholder).
                rebuilt.append(ch)
        restored_lines.append("".join(rebuilt))
    return "\n".join(restored_lines)


print(fill_custom_sep("hello_there", "_", 8))
Outputs:
hello_
there

While we're at it, here is a simple function to wrap around a list of separators (not just one):
from textwrap import wrap
def wrap_custom(source_text, separator_chars, width=70, keep_separators=True):
    """Wrap text at the last separator that fits within each line.

    Args:
        source_text: the text to wrap.
        separator_chars: container of characters after which a line may end.
        width: maximum number of characters per line; values below 1
            disable wrapping and return the text unchanged.
        keep_separators: when false, the separator at which a line is cut is
            dropped; separators elsewhere in a line are always kept.

    Returns:
        The text with "\n" inserted at the chosen cut points. A stretch with
        no separator is hard-cut at `width` characters. No newline is added
        after the final line, and a remainder that already fits in `width`
        is not split further.
    """
    if width < 1:
        return source_text
    lines = []
    start = 0
    total = len(source_text)
    while total - start > width:
        window_end = start + width
        # Rightmost separator inside the current window, if any.
        cut = -1
        for index in range(window_end - 1, start - 1, -1):
            if source_text[index] in separator_chars:
                cut = index
                break
        if cut == -1:
            # No separator found, hard cut
            lines.append(source_text[start:window_end])
            start = window_end
        else:
            line_end = cut + 1 if keep_separators else cut
            lines.append(source_text[start:line_end])
            start = cut + 1
    lines.append(source_text[start:])
    return "\n".join(lines)


wrapped = wrap_custom("Split This-This_Text", [" ","_","-"], 7, False)

How to find byte sequence in file?

I have a binary file, in which I need to change certain bit.
That bit's byte's address is relative to some byte sequence (some ASCII string):
content = array('B')
with open(filename, mode="r+b") as file:
content.fromfile(file, os.fstat(file.fileno()).st_size)
abc = [ord(letter) for letter in "ABC"]
i = content.index(abc) // ValueError: array.index(x): x not in list
content[i + 0x16] |= 1
content.tofile(file)
However, I must confess to my shame that, after Googling far and wide, I couldn't find the method to get the index of that "ABC" string...
Sure, I can write a function that does it with loops, but I can't believe there is no one-liner (OK, even two...) that accomplishes it.
How can it be done?

Not sure if this is the most Pythonic way, but this works. In this file
$ cat so.bin
���ABC̻�X��w
$ hexdump so.bin
0000000 eeff 41dd 4342 bbcc 58aa 8899 0a77
000000e
Edit: New solution starts here.
import string
# Copy so.bin to so.out.bin, replacing one byte at a known offset after the
# marker "ABC" with b"Y".
MARKER = b"ABC"
with open("so.bin", "rb") as fi:
    data = bytearray(fi.read())
# Search the raw bytes directly; index() raises ValueError when the marker is
# missing, before any output file has been created.
pos = data.index(MARKER)
# We now know the position of the interesting byte.
pos_x = pos + len(MARKER) + 3  # known offset
# Replace the interesting byte; if the offset lies past the end of the file
# the data is copied unchanged.
if pos_x < len(data):
    data[pos_x:pos_x + 1] = b"Y"
with open("so.out.bin", "wb") as fo:
    fo.write(data)
Edit: New solution ends here.
I want to get the X four positions after ABC. A little state keeping locates the position of ABC, skips the offset, prints the interesting bytes.
# Print, in hex, the byte(s) found at a fixed distance after the marker "ABC".
offsetAfterC = 3  # bytes to skip after the end of the marker
lengthAfterC = 1  # number of bytes to print
with open("so.bin", "rb") as f:
    data = f.read()
# A plain substring search replaces the hand-rolled flag tracking, which
# would also have accepted sequences such as "ABAC".
start = data.find(b"ABC")
if start != -1:
    first = start + len(b"ABC") + offsetAfterC
    # Slicing stops at the end of the data, so nothing is printed for
    # positions that lie past the end of the file.
    for b in bytearray(data[first:first + lengthAfterC]):
        print(hex(b))
Output:
0x58

How would I find the nearest space per every 2000 characters and add everything before it to a variable? (Python)

I'm currently writing a program that posts messages with a 2,000 character limit. The string I'm using is usually around 10,000 characters. However, the method I'm using to divide it up can divide the string up in the middle of a word - how would I only have it divide at the nearest space (BEFORE) 2k characters?
Here's the current code:
text = str(post.selftext)
title = await client.send_message(message.author, str(post.title))
if len(text) > 1990:
amountsplit = math.ceil(len(text) / 1990)
atatime = math.floor(len(text) / amountsplit)
donetimes = 0
lastone = 0
for i in range(amountsplit):
todonow = int(donetimes + 1) * atatime
tmp = await client.send_message(message.author, str(text[lastone:todonow]))
lastone = todonow
donetimes += 1

You can use the rfind() method of str:
from __future__ import print_function
def split_message(message, character_limit=2000):
    """Split a message into pieces of at most `character_limit` characters.

    Each piece ends just after the last space that fits within the limit;
    when a stretch contains no space it is cut exactly at the limit. The
    pieces concatenate back to the original message.

    Args:
        message: the text to split.
        character_limit: maximum length of each piece; must be at least 1.

    Returns:
        A list of pieces; it always holds at least one item (the whole
        message when it already fits).

    Raises:
        ValueError: if `character_limit` is smaller than 1, which would
            otherwise never make progress.
    """
    if character_limit < 1:
        raise ValueError("character_limit must be at least 1")
    messages = []
    while len(message) > character_limit:
        split_index = message[:character_limit].rfind(" ")
        if split_index == -1:
            # No space found, just split at the character limit
            split_index = character_limit
        else:
            # Else space is found, split after the space
            split_index += 1
        messages.append(message[:split_index])
        message = message[split_index:]
    messages.append(message)
    return messages


# Test code: exercise both a spaced and an unbroken string.
for test_string in ("this is a test string to see if this works right",
                    "thisisateststringwithnospaces"):
    for character_limit in range(1, 10):
        print("limit", character_limit, ": ", split_message(test_string, character_limit))

You can take the characters and read them from the end, using a loop that counts down until it finds a space. It should look like:
for i in range(len(text)-1, 0, -1):
if text[i]==' ':
break
print "found the last space !"
EDIT
VeryLongText = "the very long text..[..]....here"
# Keep the first 2000 characters, then cut back to the last space in them.
text = VeryLongText[:2000]
cut = text.rfind(' ')
# With no space at all there is nothing to cut back to: keep the whole slice.
Var = text[:cut] if cut != -1 else text
print(Var)
Var will be your text (the first 2000 characters of the long text) until the last space

Use a generator.
def messages_from_post(post, limit=2000):
    """Yield pieces of `post`, each at most `limit` characters long.

    A piece ends at the last space found at or before the limit; if there is
    none the text is cut exactly at the limit. Leading whitespace is
    stripped from every piece, and pieces that end up empty are skipped.

    Args:
        post: the text to split.
        limit: maximum length of a piece; must be at least 1.

    Yields:
        The pieces in order. At least one item is always produced, so an
        empty post yields a single empty string.

    Raises:
        ValueError: on first iteration, if `limit` is smaller than 1.
    """
    if limit < 1:
        raise ValueError("limit must be at least 1")
    produced = False
    while len(post) > limit:
        cut = post.rfind(' ', 0, limit + 1)
        if cut == -1:
            cut = limit
        message, post = post[:cut].lstrip(), post[cut:].lstrip()
        # A cut right at a leading space leaves nothing to send.
        if message:
            produced = True
            yield message
    if post or not produced:
        yield post

Can I use bisect to print the content of a line?

I have a file where each line is ordered alphabetically. The file is 12Gb, which means I can't simply read it line by line. The data looks like this:
brown 0 1 0 1 2
fox 3 5 0 0 1
jumped 2 0 6 1 0
The words at the beginning of each line are unique. The word and the numbers on each line are separated by tabs. I want to be able to query the file for specific keywords. For example, if I query "fox", the program should return "fox 3 5 0 0 1".
It seems that a good candidate for this would be the bisect module: https://docs.python.org/3.0/library/bisect.html
I found a post which uses bisect to find out the line number of a keyword: How do I perform binary search on a text file to search a keyword in python?
This is what the code looks like:
import bisect
import os
class Query(object):
    """Search key that compares itself against the tail of a record.

    Comparisons slice the record at `index` and compare the query with what
    follows, which lets the bisect functions search records that carry a
    fixed-width prefix before the key.
    """

    def __init__(self, query, index=5):
        # query: the key being searched for.
        # index: number of leading characters of each record to ignore.
        self.query = query
        self.index = index

    def __lt__(self, comparable):
        """Return True if the query sorts before the record's key part."""
        return self.query < comparable[self.index:]

    def __gt__(self, comparable):
        """Return True if the query sorts after the record's key part.

        bisect_left evaluates ``record < query``; with a plain record on the
        left Python falls back to this reflected comparison.
        """
        return self.query > comparable[self.index:]
class FileSearcher(object):
    """Sequence view over a file made of fixed-size records.

    Every record is assumed to occupy `record_size` characters plus one
    os.linesep. The object supports len() and integer indexing, which is
    all the bisect functions need.
    """

    def __init__(self, file_pointer, record_size=35):
        # file_pointer: an open, seekable file object.
        # record_size: length of one record, line separator excluded.
        self.file_pointer = file_pointer
        self.file_pointer.seek(0, os.SEEK_END)
        self.record_size = record_size + len(os.linesep)
        self.num_bytes = self.file_pointer.tell()
        # Number of complete records; a trailing partial record is ignored.
        self.file_size = (self.num_bytes // self.record_size)

    def __len__(self):
        """Return the number of complete records in the file."""
        return self.file_size

    def __getitem__(self, item):
        """Return record number `item`, line separator included.

        Negative indexes count from the end. IndexError is raised for an
        index outside the file, which also lets plain iteration terminate.
        """
        if item < 0:
            item += self.file_size
        if not 0 <= item < self.file_size:
            raise IndexError("record index out of range")
        self.file_pointer.seek(item * self.record_size)
        return self.file_pointer.read(self.record_size)
with open('myfile') as file_to_search:
query = 'fox\t' #token to query
wrapped_query = Query(query)
searchable_file = FileSearcher(file_to_search)
linepos = bisect.bisect(searchable_file, wrapped_query)
print "Located # line: ", linepos
#print content of line?
However, I can't figure out how to actually print the content of the line. I should at least add a read statement somewhere, but I don't know where.
Is it possible to print the content of the line with the bisect module?

If you want go with Python solution, you can do the following:
Read file by small chunks of MAX_LINE bytes, each time moving forward by fixed offset
That offset determines block size
For each such read, determine the key (first word in a line)
These keys serve as delimiters of blocks
Construct the list of such keys. The list would be sorted as keys are ordered
You may persist such list somewhere via pickle/json.dumps/...
When querying, use bisect to find the index of the block where your key is located
Read that block entirely and find the key with data
Here is the example file bigfile:
abc 4
bar 2
baz 3
egg 6
foo 1
god 8
ham 5
sex 7
The code:
import os
from bisect import bisect
MAX_LINE = 7     # upper bound on the length of one line, in bytes
BLOCK_SIZE = 10  # distance in bytes between two sampled offsets


def parse_chunks(filename):
    """Build the sorted list of keys that delimit the blocks of a file.

    The file is sampled every BLOCK_SIZE bytes. Entry 0 is the key (first
    word) of the first line; entry k is the key of the first line that
    starts after the first newline found at or after offset k * BLOCK_SIZE.

    Args:
        filename: path of a file whose lines are sorted by their first word.

    Returns:
        A list of keys as text, one per sampled block; empty for an empty
        file. Sampling stops early once a sample no longer contains a
        complete line.
    """
    size = os.path.getsize(filename)
    chunks = []
    with open(filename, 'rb') as file:
        head = file.read(MAX_LINE * 2)
        if not head:
            return chunks
        # Work on bytes throughout: str() of a bytes object is its repr, in
        # which a newline is the two characters backslash and n.
        first_line = head.split(b'\n', 1)[0]
        chunks.append(first_line.split()[0].decode('utf-8'))
        for pos in range(BLOCK_SIZE, size, BLOCK_SIZE):
            file.seek(pos)
            block = file.read(MAX_LINE * 2)
            # The sample usually starts mid-line: skip to the first newline
            # and take the complete line that follows it.
            first_eol = block.find(b'\n')
            second_eol = block.find(b'\n', first_eol + 1)
            if first_eol == -1 or second_eol == -1:
                break
            line = block[first_eol + 1:second_eol]
            chunks.append(line.split()[0].decode('utf-8'))
    return chunks
if __name__ == '__main__':
    BLOCK_SIZE = 10
    filename = 'bigfile'
    chunks = parse_chunks(filename)
    query = 'abc'
    # bisect() - 1 is the last block whose first key is <= query. Clamp it so
    # a query that sorts before every key searches block 0 instead of
    # seeking to a negative offset.
    pos_before = max(bisect(chunks, query) - 1, 0)
    # One extra byte so the newline ending a maximum-length line is included.
    span = BLOCK_SIZE + MAX_LINE + 1
    with open(filename, 'rb') as file:
        file.seek(pos_before * BLOCK_SIZE)
        raw = file.read(span)
    lines = raw.decode('utf-8', 'replace').split('\n')
    if pos_before > 0:
        # The read starts mid-line; the first piece is only a fragment.
        lines = lines[1:]
    if len(raw) == span:
        # The read was cut off by its size limit, so the last piece may be a
        # truncated line.
        lines = lines[:-1]
    for line in lines:
        # Compare the whole first word so a query cannot match inside another
        # key or inside the data columns.
        if line.split()[:1] == [query]:
            print(line)
            break
In this toy example I use block size of 10 bytes, in your case of 12GB file I'd suggest you to start with 1M.

The following recursive function should be able to narrow the search interval. I'm not sure that you can modify it so that it returns a match or None for no match.
def bisearch(f, word, i, j, min_span=10**6):
    """Narrow the byte interval of a sorted file in which `word` can start.

    The file must hold lines sorted by their first tab-separated field.
    The interval [i, j) is halved repeatedly by looking at the key of the
    first complete line after the midpoint.

    Args:
        f: an open, seekable file object (binary mode is the safe choice,
            because arbitrary offsets are used).
        word: the key to look for; text is encoded as UTF-8 when the file
            yields bytes.
        i: start offset of the interval; must be 0 or the start of a line.
        j: end offset of the interval (exclusive), e.g. the file size.
        min_span: the search stops once the interval is shorter than this;
            values below 2 are treated as 2.

    Returns:
        ``(k, k)`` when a line with exactly this key was probed, k being the
        offset where that line starts; otherwise the narrowed ``(i, j)``
        with i < j, to be scanned by the caller.
    """
    # With a span of 1 a miss could collapse to i == j, which callers read
    # as "exact match".
    min_span = max(min_span, 2)
    while j - i >= min_span:
        middle = (i + j) // 2
        f.seek(middle)
        f.readline()  # discard the line that straddles the midpoint
        k = f.tell()
        line = f.readline() if k < j else ''
        if not line:
            # No line starts in [middle, j): continue in the lower half.
            j = middle
            continue
        if isinstance(line, bytes):
            tab, eol = b'\t', b'\r\n'
            target = word if isinstance(word, bytes) else word.encode('utf-8')
        else:
            tab, eol = '\t', '\r\n'
            target = word
        key = line.split(tab, 1)[0].rstrip(eol)
        if key == target:
            return k, k
        if key < target:
            i = k
        else:
            j = k
    return i, j
and here an example of usage
word = ...
f = open(...)
i,j = bisearch(f, word, 0, len_f)
f.seek(i)
if i==j:
line = f.readline()
else:
#################### EDIT ################
# OLD
# buffer = f.read(1E6)
# NEW
buffer = f.read(j-i)
lenw = len(word)
for line in buffer.split('\n'):
if line[:lenw] == word: break
else:
# no matches, SOS
result = process(line)

Try seeking to the line in question and using readline.
print "Located # line: ", linepos
file_to_search.seek(linepos)
line = file_to_search.readline()
This is assuming linepos is the position of the line, counted in bytes from the beginning of the file. If it's the position counted in line numbers, you'll need to multiply by the number of bytes per line before seeking.
print "Located # line: ", linepos
file_to_search.seek(linepos * searchable_file.record_size)
line = file_to_search.readline()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

python to search within pdf file - python

Related

How to test unicode code points for valid LTR use in Python?

How to get textwrap in Python to break on underscores or arbitrary characters (not just hyphens and whitespace)

How to find byte sequence in file?

How would I find the nearest space per every 2000 characters and add everything before it to a variable? (Python)

Can I use bisect to print the content of a line?

Categories

Resources