Reverse string translation with dictionary - python

I am trying to reverse the use of the translate function. I pass a dictionary into str.maketrans, which translates the original string correctly, as per the dictionary.
cipher_dictionary = {'a': 'h5c', 'b': 'km3', 'c': '5fv'}
def cipher(text):
trans = str.maketrans(cipher_dictionary)
return text.translate(trans)
Above is the sample dictionary, together with the function that I use to translate strings. Translating abc gives me h5ckm35fv, which is desired.
Now, to reverse it, I am trying to use the following function.
def decipher(text):
reverse = {value: key for key, value in cipher_dictionary.items()}
trans = str.maketrans(reverse)
return text.translate(trans)
Using it raises an error.
Traceback (most recent call last):
File "C:\Users\lukas\Desktop\cipher.py", line 21, in <module>
deciphered = decipher(ciphered)
File "C:\Users\lukas\Desktop\cipher.py", line 13, in decipher
trans = str.maketrans(reverse)
ValueError: string keys in translate table must be of length 1
I am aware that this is because the values in cipher_dictionary aren't equal length to a, b and c. How can I go about rewriting the decipher function, to make h5ckm35fv translate back into abc?
cipher_dictionary = {'a': 'h5c', 'b': 'km3', 'c': '5fv'}
def cipher(text):
trans = str.maketrans(cipher_dictionary)
return text.translate(trans)
def decipher(text):
reverse = {value: key for key, value in cipher_dictionary.items()}
trans = str.maketrans(reverse)
return text.translate(trans)
if __name__ == '__main__':
text_to_cipher = 'abc'
ciphered = cipher(text_to_cipher)
print(ciphered)
deciphered = decipher(ciphered)
print(deciphered)
Running any of the functions provided in answers works perfectly, except for when there is white space in the input.
Text to cipher: some white space
Ciphered text: px3h54oa4b83 ky6u1v0t6yq3b83 px3sy9h5c5fvb83
Traceback (most recent call last):
File "C:\Users\Lukasz\Desktop\Python\Cipher\cip.py", line 45, in <module>
deciphered = decipher(ciphered)
File "C:\Users\Lukasz\Desktop\Python\Cipher\cip.py", line 36, in decipher
decoded_text = ''.join(reverse[text[i:i+3]] for i in range(0, len(text), 3))
File "C:\Users\Lukasz\Desktop\Python\Cipher\cip.py", line 36, in <genexpr>
decoded_text = ''.join(reverse[text[i:i+3]] for i in range(0, len(text), 3))
KeyError: ' ky'

def decipher(sentence):
reverse = {value: key for key, value in cipher_dictionary.items()}
decoded_text = ' '.join(''.join(reverse[word[i:i+3]] for i in range(0, len(word), 3)) for word in sentence.split(' '))
return decoded_text
Assuming that every letter is being encoded into a set of 3 letters.

Assuming that the values in the dictionary for a prefix free code, then you can keep trying prefixes of the unprocessed ciphertext until you find a match in the reverse dictionary:
def decipher(text, d):
r = {v: k for k,v in d.items()} # Reversed dictionary
plaintext = ''
index = 0
length = 1
while index + length <= len(text):
try:
plaintext += r[text[index:index+length]]
index = index + length
length = 1
except:
length += 1
return plaintext
If the values of the dictionary do not form a prefix free code, then the algorithm involves backtracking, and will return one possible plaintext if the cipher is non bijective:
def decipher2(text, d):
r = {v: k for k,v in d.items()} # Reversed dictionary
length = 1
while length <= len(text):
try:
val = r[text[:length]]
if length == len(text):
return val
else:
return val + decipher2(text[length:], d)
except:
length += 1
raise ValueError('Malformed input.')

If you know that all cipher values are of length 3 (i.e. that all values in cipher_dictionary are three characters long), then:
def decrypt(ciphertext, cipher_dict):
decipher_dict = {v:k for k,v in cipher_dict.items()}
answer = []
for cipher in (ciphertext[i:i+3] for i in range(0,len(ciphertext), 3)):
answer.append(decipher_dict[cipher])
return ''.join(answer)
On the other hand, if you don't know that all values are of length 3 (or if they are not of constant size), then try this:
def decrypt(ciphertext, cipher_dict):
decipher_dict = {v:k for k,v in cipher_dict.items()}
answer = []
start = 0
for end in range(len(ciphertext)):
if ciphertext[start:end] not in decipher_dict: continue
answer.append(decipher_dict[ciphertext[start:end]])
start = end
return ''.join(answer)
The problem with this is that it is a greedy algorithm and incurs all the shortcomings of its naïvité
UPDATE:
If you want to do this with sentences (words separated by whitespace):
encryptedSentence = '...'
answer = []
for word in sentence.split():
answer.append(decrypt(word, cipher_dict))
return ' '.join(answer)

Related

Python - I am getting an error that says 'substring not found'

I am trying to make a transposition cipher encryption function for a class project.
from string import ascii_lowercase
def swap(s: str, index0: int, index1: int):
smaller = index0 if index0 < index1 else index1
bigger = index0 if index0 >= index1 else index1
if bigger >= len(s) or smaller < 0:
return None
ret = s[:smaller] + s[bigger] + s[smaller+1:] # swap first
ret = ret[:bigger] + s[smaller] + s[bigger+1:] # swap second
return ret
def swap_encrypt(s: str, key:str):
ret = s
for key_chr in key:
index = ascii_lowercase.index(key_chr)
swap_this = index % len(ret)
with_this = (swap_this + 1) % len(ret)
ret = swap(ret, swap_this, with_this)
return ret
s = ''
key = ''
def main2():
s = input('Enter your message: ')
s = cleanup(s)
key = input('Enter your keyword: ')
key = cleanup(key)
ret= swap_encrypt((s), (key))
print(cleanup(ret))
main2()
I am getting the error 'substring not found', is there something I am doing wrong?
If my input is =(‘SLOTH POWER’) for s, (‘TOP’) for the key, my output should be: ‘RLOTPOHWES’
Is there also another to limit the functions to ord(), len(), and range()? If so, could I be shown how as well?
error:
Traceback (most recent call last):
File "c:\Users\darks\OneDrive\Documents\7\ciphers.py", line 139, in <module>
main2()
File "c:\Users\darks\OneDrive\Documents\7\ciphers.py", line 136, in main2
ret= swap_encrypt((s), (key))
File "c:\Users\darks\OneDrive\Documents\7\ciphers.py", line 123, in swap_encrypt
index = ascii_lowercase.index(key_chr)
ValueError: substring not found
It can't find the character in the ascii_lowercase, because your input is uppercase. Try "sloth power" instead of "SLOTH POWER", or use s.lower().

How to implement HMAC in python without using the hmac library?

I want to implement the hmac algorithm with SHA-1 by the definition from RFC 2104. The code is running but the results aren't the same as the test-vectors from RFC. I'm not sure if I'm loading the values correctly(String to Hex, or String to Bytes?).
As template I've used the pseudo-code from wikipedia
I'm not sure about the terms 'blocksize' and 'output size'. In the code from wikipedia the outputsize is one of the input values but never used.
This is my code so far:
First I'm setting up a hash-function, then I'm converting my input-strings (key and message) into hex values. Next step is to to look if key hast go get hashed or filled with zeros. Next I'm xor-ing the single chars from the key with those values (I don't know where they come from, but they're in every example without any comment). Last but not least I'm combining an inner string(I_key_pad + message) and hash it which results in an outer strings that im combining with the outer pad and hash it again.
import hashlib
from functools import reduce
def hmac(key, message, hashfunc):
hasher = hashlib.sha1
blocksize = 40
message = toHex(message) #is this right?
key = toHex(key)
#alternative: loading values as bytes
#message = bytes(message, 'utf-8')
#key = bytes(key, 'utf-8')
if len(key) > blocksize:
key = hasher(key)
else:
#key = key.ljust(blocksize, '0') #filling from right to left
#key = key.ljust(blocksize, b'\0') #same as above but for bytes
key = pad(key, blocksize) #filling from left to right
val1 = 0x5c
val2 = 0x36
i = 0
o_key_pad = ""
i_key_pad = ""
while i < blocksize:
o_key_pad += str(ord(key[i]) ^ val1)
i_key_pad += str(ord(key[i]) ^ val2)
i += 1
tmp_string = str(i_key_pad) + str(message)
tmp_string = tmp_string.encode()
inner_hash = hasher(tmp_string).hexdigest()
fullstring = str(o_key_pad) + inner_hash
fullstring = fullstring.encode()
fullstring = hasher(fullstring).hexdigest()
print(fullstring)
def pad(key, blocksize):
key = str(key)
while len(key) < blocksize:
key = '0' + key
key = key
return key
def toHex(s):
lst = []
for ch in s:
hv = hex(ord(ch)).replace('0x', '')
if len(hv) == 1:
hv = '0' + hv
lst.append(hv)
return reduce(lambda x, y: x + y, lst)
def main():
while (1):
key = input("key = ")
message = input("message = ")
hash = input("hash (0: SHA-256, 1: SHA-1) = ")
hmac(key, message, hash)
if __name__ == "__main__":
main()
I'm not understanding all the steps in your code, but here's a short example showing HMAC-SHA1 using only hashlib.sha1, with a helper function xor.
import hashlib
def xor(x, y):
return bytes(x[i] ^ y[i] for i in range(min(len(x), len(y))))
def hmac_sha1(key_K, data):
if len(key_K) > 64:
raise ValueError('The key must be <= 64 bytes in length')
padded_K = key_K + b'\x00' * (64 - len(key_K))
ipad = b'\x36' * 64
opad = b'\x5c' * 64
h_inner = hashlib.sha1(xor(padded_K, ipad))
h_inner.update(data)
h_outer = hashlib.sha1(xor(padded_K, opad))
h_outer.update(h_inner.digest())
return h_outer.digest()
def do_tests():
# test 1
k = b'\x0b' * 20
data = b"Hi There"
result = hmac_sha1(k, data)
print(result.hex())
# add tests as desired

Windows/Python Error WindowsError: [Error 3] The system cannot find the path specified

Hi I am new to python and i need some help. I trying to run a file on Windows 10 OS with python 2.7.
import os
import re
import codecs
import numpy as np
import theano
models_path = "./models"
eval_path = "./evaluation"
eval_temp = os.path.join(eval_path, "temp")
eval_script = os.path.join(eval_path, "conlleval")
def get_name(parameters):
"""
Generate a model name from its parameters.
"""
l = []
for k, v in parameters.items():
if type(v) is str and "/" in v:
l.append((k, v[::-1][:v[::-1].index('/')][::-1]))
else:
l.append((k, v))
name = ",".join(["%s=%s" % (k, str(v).replace(',', '')) for k, v in l])
return "".join(i for i in name if i not in "\/:*?<>|")
def set_values(name, param, pretrained):
"""
Initialize a network parameter with pretrained values.
We check that sizes are compatible.
"""
param_value = param.get_value()
if pretrained.size != param_value.size:
raise Exception(
"Size mismatch for parameter %s. Expected %i, found %i."
% (name, param_value.size, pretrained.size)
)
param.set_value(np.reshape(
pretrained, param_value.shape
).astype(np.float32))
def shared(shape, name):
"""
Create a shared object of a numpy array.
"""
if len(shape) == 1:
value = np.zeros(shape) # bias are initialized with zeros
else:
drange = np.sqrt(6. / (np.sum(shape)))
value = drange * np.random.uniform(low=-1.0, high=1.0, size=shape)
return theano.shared(value=value.astype(theano.config.floatX), name=name)
def create_dico(item_list):
"""
Create a dictionary of items from a list of list of items.
"""
assert type(item_list) is list
dico = {}
for items in item_list:
for item in items:
if item not in dico:
dico[item] = 1
else:
dico[item] += 1
return dico
def create_mapping(dico):
"""
Create a mapping (item to ID / ID to item) from a dictionary.
Items are ordered by decreasing frequency.
"""
sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
id_to_item = {i: v[0] for i, v in enumerate(sorted_items)}
item_to_id = {v: k for k, v in id_to_item.items()}
return item_to_id, id_to_item
def zero_digits(s):
"""
Replace every digit in a string by a zero.
"""
return re.sub('\d', '0', s)
def iob2(tags):
"""
Check that tags have a valid IOB format.
Tags in IOB1 format are converted to IOB2.
"""
for i, tag in enumerate(tags):
if tag == 'O':
continue
split = tag.split('-')
if len(split) != 2 or split[0] not in ['I', 'B']:
return False
if split[0] == 'B':
continue
elif i == 0 or tags[i - 1] == 'O': # conversion IOB1 to IOB2
tags[i] = 'B' + tag[1:]
elif tags[i - 1][1:] == tag[1:]:
continue
else: # conversion IOB1 to IOB2
tags[i] = 'B' + tag[1:]
return True
def iob_iobes(tags):
"""
IOB -> IOBES
"""
new_tags = []
for i, tag in enumerate(tags):
if tag == 'O':
new_tags.append(tag)
elif tag.split('-')[0] == 'B':
if i + 1 != len(tags) and \
tags[i + 1].split('-')[0] == 'I':
new_tags.append(tag)
else:
new_tags.append(tag.replace('B-', 'S-'))
elif tag.split('-')[0] == 'I':
if i + 1 < len(tags) and \
tags[i + 1].split('-')[0] == 'I':
new_tags.append(tag)
else:
new_tags.append(tag.replace('I-', 'E-'))
else:
raise Exception('Invalid IOB format!')
return new_tags
def iobes_iob(tags):
"""
IOBES -> IOB
"""
new_tags = []
for i, tag in enumerate(tags):
if tag.split('-')[0] == 'B':
new_tags.append(tag)
elif tag.split('-')[0] == 'I':
new_tags.append(tag)
elif tag.split('-')[0] == 'S':
new_tags.append(tag.replace('S-', 'B-'))
elif tag.split('-')[0] == 'E':
new_tags.append(tag.replace('E-', 'I-'))
elif tag.split('-')[0] == 'O':
new_tags.append(tag)
else:
raise Exception('Invalid format!')
return new_tags
def insert_singletons(words, singletons, p=0.5):
"""
Replace singletons by the unknown word with a probability p.
"""
new_words = []
for word in words:
if word in singletons and np.random.uniform() < p:
new_words.append(0)
else:
new_words.append(word)
return new_words
def pad_word_chars(words):
"""
Pad the characters of the words in a sentence.
Input:
- list of lists of ints (list of words, a word being a list of char indexes)
Output:
- padded list of lists of ints
- padded list of lists of ints (where chars are reversed)
- list of ints corresponding to the index of the last character of each word
"""
max_length = max([len(word) for word in words])
char_for = []
char_rev = []
char_pos = []
for word in words:
padding = [0] * (max_length - len(word))
char_for.append(word + padding)
char_rev.append(word[::-1] + padding)
char_pos.append(len(word) - 1)
return char_for, char_rev, char_pos
def create_input(data, parameters, add_label, singletons=None):
"""
Take sentence data and return an input for
the training or the evaluation function.
"""
words = data['words']
chars = data['chars']
if singletons is not None:
words = insert_singletons(words, singletons)
if parameters['cap_dim']:
caps = data['caps']
char_for, char_rev, char_pos = pad_word_chars(chars)
input = []
if parameters['word_dim']:
input.append(words)
if parameters['char_dim']:
input.append(char_for)
if parameters['char_bidirect']:
input.append(char_rev)
input.append(char_pos)
if parameters['cap_dim']:
input.append(caps)
if add_label:
input.append(data['tags'])
return input
def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
id_to_tag, dictionary_tags, eval_id):
"""
Evaluate current model using CoNLL script.
"""
n_tags = len(id_to_tag)
predictions = []
count = np.zeros((n_tags, n_tags), dtype=np.int32)
for raw_sentence, data in zip(raw_sentences, parsed_sentences):
input = create_input(data, parameters, False)
if parameters['crf']:
y_preds = np.array(f_eval(*input))[1:-1]
else:
y_preds = f_eval(*input).argmax(axis=1)
y_reals = np.array(data['tags']).astype(np.int32)
assert len(y_preds) == len(y_reals)
p_tags = [id_to_tag[y_pred] for y_pred in y_preds]
r_tags = [id_to_tag[y_real] for y_real in y_reals]
if parameters['tag_scheme'] == 'iobes':
p_tags = iobes_iob(p_tags)
r_tags = iobes_iob(r_tags)
for i, (y_pred, y_real) in enumerate(zip(y_preds, y_reals)):
new_line = " ".join(raw_sentence[i][:-1] + [r_tags[i], p_tags[i]])
predictions.append(new_line)
count[y_real, y_pred] += 1
predictions.append("")
# Write predictions to disk and run CoNLL script externally
#eval_id = np.random.randint(1000000, 2000000)
output_path = os.path.join(eval_temp, "eval.%i.output" % eval_id)
scores_path = os.path.join(eval_temp, "eval.%i.scores" % eval_id)
with codecs.open(output_path, 'w', 'utf8') as f:
f.write("\n".join(predictions))
os.system("%s < %s > %s" % (eval_script, output_path, scores_path))
# CoNLL evaluation results
eval_lines = [l.rstrip() for l in codecs.open(scores_path, 'r', 'utf8')]
#trainLog = open('train.log', 'w')
for line in eval_lines:
print line
#trainLog.write("%s\n" % line)
# Remove temp files
# os.remove(output_path)
# os.remove(scores_path)
# Confusion matrix with accuracy for each tag
print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
"ID", "NE", "Total",
*([id_to_tag[i] for i in xrange(n_tags)] + ["Percent"])
)
for i in xrange(n_tags):
print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
str(i), id_to_tag[i], str(count[i].sum()),
*([count[i][j] for j in xrange(n_tags)] +
["%.3f" % (count[i][i] * 100. / max(1, count[i].sum()))])
)
# Global accuracy
print "%i/%i (%.5f%%)" % (
count.trace(), count.sum(), 100. * count.trace() / max(1, count.sum())
)
# F1 on all entities
return float(eval_lines[1].strip().split()[-1])
When i compile the code as it is i always get the error.I think its either because of restriction on path length in windows or it needs or slashes. I dont know what to add to subtract in order to resolve the problem.
run train.py --train lstm/fold1/train --dev lstm/fold1/dev --test lstm/fold1/test
WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29
Using gpu device 0: GeForce GT 620M (CNMeM is enabled with initial size: 85.0% of memory, cuDNN not available)
Traceback (most recent call last):
File "E:\New-Code\tagger-master\tagger-master\train.py", line 135, in
model = Model(parameters=parameters, models_path=models_path)
File "model.py", line 36, in init
os.makedirs(self.model_path)
File "C:\Users\Acer\Anaconda2\envs\env_name27\lib\os.py", line 157, in makedirs
mkdir(name, mode)
WindowsError: [Error 3] The system cannot find the path specified: './models\tag_scheme=iob,lower=False,zeros=False,char_dim=25,char_lstm_dim=25,char_bidirect=True,word_dim=100,word_lstm_dim=100,word_bidirect=True,pre_emb=,all_emb=False,cap_dim=0,crf=True,dropout=0.3,lr_method=sgd-lr_.005'
In windows pathe is given by back slash \ instead of forward slash / which is used in linux/unix.
Try it like blow if file is 1 folder back:
models_path = "..\models"
eval_path = "..\evaluation"

Python multilevel dict to strings

I have a python dictionary and a dictionary with in some of the values. I'm trying to generate a dotted delimited string of the keys in the structure with the value at the end. With the example below I'd want FIELD0 1 and NAME. I could create a for loop to process the data or a recursive function. I didn't know if there was something prebuilt method for collapsing a multilevel dictionary to delimited strings?
I was trying the following but as you know it will just append the sub dictionaries.
'.'.join('%s %s\n' % i for i in a.items())
{'BOGUS1': 'BOGUS_VAL1',
'BOGUS2': 'BOGUS_VAL1',
'FIELD0': {'F0_VAL1': 1, 'F0_VAL2': 2},
'FIELD1': {'F1_VAL1': 80, 'F1_VAL2': 67, 'F1_VAL3': 100},
'FOOBAR1': 'FB_VAL1',
'NAME': 'VALUE'}
BOGUS2.BOGUS_VAL1
.NAME.VALUE
.BOGUS1.BOGUS_VAL1
.FIELD0.{'F0_VAL1': 1, 'F0_VAL2': 2}
.FIELD1.{'F1_VAL2': 67, 'F1_VAL3': 100, 'F1_VAL1': 80}
.FOOBAR1.FB_VAL1
# Wanted results
FIELD0.F0_VAL1 1
FIELD0.F0_VAL2 2
FIELD1.F1_VAL1 80
FIELD1.F2_VAL1 67
FIELD1.F3_VAL1 100
NAME VALUE
How about something like this:
def dotnotation(d, prefix = ''):
for k, v in d.items():
if type(v) == type(dict()):
dotnotation(v, prefix + str(k) + '.')
else:
print prefix + str(k) + ' = ' + str(v)
Also the formatting can be changed according to the stored types. This should work with your example.
Here is my approach:
def dotted_keys(dic):
""" Generated dot notation keys from a dictionary """
queue = [(None, dic)] # A queue of (prefix, object)
while queue:
prefix, current = queue.pop(0)
for k, v in current.iteritems():
if isinstance(v, dict):
queue.append((k, v))
elif prefix:
yield prefix + '.' + k
else:
yield k
def dict_search(dic, dotted_key, default=None):
""" Take a dictionary and a dotted key and return the value. If not
found, return the value specified by the default parameter.
Example: dict_search(d, 'FIELD0.F0_VAL2')
"""
current = dic
keys = dotted_key.split('.')
for k in keys:
if k in current:
current = current[k]
else:
return default
return current
if __name__ == '__main__':
d = {
'BOGUS1': 'BOGUS_VAL1',
'BOGUS2': 'BOGUS_VAL1',
'FIELD0': {'F0_VAL1': 1, 'F0_VAL2': 2, 'XYZ': {'X1': 9}},
'FIELD1': {'F1_VAL1': 80, 'F1_VAL2': 67, 'F1_VAL3': 100},
'FOOBAR1': 'FB_VAL1',
'NAME': 'VALUE'
}
for k in dotted_keys(d):
print(k, '=', dict_search(d, k))
Output:
BOGUS2 = BOGUS_VAL1
NAME = VALUE
BOGUS1 = BOGUS_VAL1
FOOBAR1 = FB_VAL1
FIELD0.F0_VAL1 = 1
FIELD0.F0_VAL2 = 2
FIELD1.F1_VAL2 = 67
FIELD1.F1_VAL3 = 100
FIELD1.F1_VAL1 = 80
XYZ.X1 = None
The dotted_keys function generates a list of keys in dotted notation while the dict_search function takes a dotted key and return a value.

Can I use bisect to print the content of a line?

I have a file where each line is ordered alphabetically. The file is 12Gb, which means I can't simply read it line by line. The data looks like this:
brown 0 1 0 1 2
fox 3 5 0 0 1
jumped 2 0 6 1 0
The words at the beginning of each line are unique. The word and the numbers on each line are separated by tabs. I want to be able to query the file for specific keywords. For example, if I query "fox", the program should return "fox 3 5 0 0 1".
It seems that a good candidate for this would be the bisect module: https://docs.python.org/3.0/library/bisect.html
I found a post which uses bisect to find out the line number of a keyword: How do I perform binary search on a text file to search a keyword in python?
This is what the code looks like:
import bisect
import os
class Query(object):
def __init__(self, query, index=5):
self.query = query
self.index = index
def __lt__(self, comparable):
return self.query < comparable[self.index:]
class FileSearcher(object):
def __init__(self, file_pointer, record_size=35):
self.file_pointer = file_pointer
self.file_pointer.seek(0, os.SEEK_END)
self.record_size = record_size + len(os.linesep)
self.num_bytes = self.file_pointer.tell()
self.file_size = (self.num_bytes // self.record_size)
def __len__(self):
return self.file_size
def __getitem__(self, item):
self.file_pointer.seek(item * self.record_size)
return self.file_pointer.read(self.record_size)
with open('myfile') as file_to_search:
query = 'fox\t' #token to query
wrapped_query = Query(query)
searchable_file = FileSearcher(file_to_search)
linepos = bisect.bisect(searchable_file, wrapped_query)
print "Located # line: ", linepos
#print content of line?
However, I can't figure out how to actually print the content of the line. I should at least add a read statement somewhere, but I don't know where.
Is it possible to print the content of the line with the bisect module?
If you want go with Python solution, you can do the following:
Read file by small chunks of MAX_LINE bytes, each time moving forward by fixed offset
That offset determines block size
For each such read, determine the key (first word in a line)
These keys serve as delimiters of blocks
Construct the list of such keys. The list would be sorted as keys are ordered
You may persist such list somewhere via pickle/json.dumps/...
When quering, find via bisect the index of a block where you key is located
Read that block entirely and find the key with data
Here is the example file bigfile:
abc 4
bar 2
baz 3
egg 6
foo 1
god 8
ham 5
sex 7
The code:
import os
from bisect import bisect
MAX_LINE = 7
BLOCK_SIZE = 10
def parse_chunks(filename):
size = os.path.getsize(filename)
chunks = []
with open(filename, 'rb') as file:
block = str(file.read(MAX_LINE*2))
first_line = block[:block.find('\n') + 1]
chunks.append(first_line.split()[0])
pos = BLOCK_SIZE
while pos < size:
file.seek(pos)
block = str(file.read(MAX_LINE*2))
first_eol = block.find('\n')
second_eol = block.find('\n', first_eol + 1)
if first_eol == -1 or second_eol == -1:
break
line = block[first_eol + 1:second_eol]
key = line.split()[0]
chunks.append(key)
pos += BLOCK_SIZE
return chunks
if __name__ == '__main__':
BLOCK_SIZE = 10
filename = 'bigfile'
chunks = parse_chunks(filename)
query = 'abc'
pos_before = bisect(chunks, query) - 1
with open(filename, 'rb') as file:
file.seek(pos_before*BLOCK_SIZE)
block = str(file.read(BLOCK_SIZE + MAX_LINE))
line_start = block.find(query)
line_end = block.find('\n', line_start + 1)
line = block[line_start:line_end]
print(line)
In this toy example I use block size of 10 bytes, in your case of 12GB file I'd suggest you to start with 1M.
The following recursive function should be able to narrow the search interval. I'm not sure that you can modify it so that it returns a match or None for no match.
def bisearch(f, word, i, j)
if (j-1)<1E6: return i,j
k = (i+j)/2
f.seek(k)
while k<j:
c = f.read(1)
k = k+1
if c == '\n': break
else:
# ??? no match ??? I'm not sure
w = []
while 1:
c = f.read(1)
if c == '\t': break
w.append(c)
w = "".join(w)
if w == word:
return k, k
if w < word:
return bisearch(f, word, k, j)
else:
return bisearch(f, word, i, k)
and here an example of usage
word = ...
f = open(...)
i,j = bisearch(f, word, 0, len_f)
f.seek(i)
if i==j:
line = f.readline()
else:
#################### EDIT ################
# OLD
# buffer = f.read(1E6)
# NEW
buffer = f.read(j-i)
lenw = len(word)
for line in buffer.split('\n'):
if line[:lenw] == word: break
else:
# no matches, SOS
result = process(line)
Try seeking to the line in question and using readline.
print "Located # line: ", linepos
file_to_search.seek(linepos)
line = file_to_search.readline()
This is assuming linepos is the position of the line, counted in bytes from the beginning of the file. If it's the position counted in line numbers, you'll need to multiply by the number of bytes per line before seeking.
print "Located # line: ", linepos
file_to_search.seek(linepos * searchable_file.record_size)
line = file_to_search.readline()

Categories

Resources