How to combine two similar functions that convert between hiragana and katakana?

How to combine two similar functions that convert between hiragana and katakana? - python

I have two functions that convert between katakana and hiragana and they look the same:
# Code-point distance between the two kana blocks.
katakana_minus_hiragana = 0x30a1 - 0x3041  # KATAKANA LETTER A - HIRAGANA A


def is_hirgana(char):
    """Return True if the first character of *char* lies strictly between U+3040 and U+3097."""
    return 0x3040 < ord(char[0]) < 0x3097


def is_katakana(char):
    """Return True if the first character of *char* lies strictly between U+30A0 and U+30F7."""
    return 0x30a0 < ord(char[0]) < 0x30f7


def hiragana_to_katakana(hiragana_text):
    """Convert the leading run of hiragana in *hiragana_text* to katakana.

    Conversion stops at the first character that is not hiragana.

    Returns:
        A tuple ``(converted_prefix, number_of_converted_characters)``.
    """
    katakana_text = ""
    max_len = 0
    for char in hiragana_text:
        if not is_hirgana(char):
            break
        katakana_text += chr(ord(char) + katakana_minus_hiragana)
        max_len += 1
    return katakana_text, max_len


def katakana_to_hiragana(katakana_text):
    """Convert the leading run of katakana in *katakana_text* to hiragana.

    Conversion stops at the first character that is not katakana.

    Returns:
        A tuple ``(converted_prefix, number_of_converted_characters)``.
    """
    hiragana_text = ""
    max_len = 0
    for char in katakana_text:
        if not is_katakana(char):
            break
        hiragana_text += chr(ord(char) - katakana_minus_hiragana)
        max_len += 1
    return hiragana_text, max_len
Is there a way to simplify hiragana_to_katakana() and katakana_to_hiragana() into a duck-type function or a super/meta function?
E.g. something like
def convert_hk_kh(text, charset_range, offset):
    """Shift the leading run of characters of *text* that fall inside a code-point range.

    Args:
        text: Input string.
        charset_range: ``(start, end)`` code points; both bounds are exclusive.
        offset: Value added to the code point of every converted character.

    Returns:
        A tuple ``(converted_prefix, number_of_converted_characters)``.
        Conversion stops at the first character outside the range.
    """
    charset_start, charset_end = charset_range
    output_text = ""
    max_len = 0
    for char in text:
        if not charset_start < ord(char) < charset_end:
            break
        output_text += chr(ord(char) + offset)
        max_len += 1
    return output_text, max_len


def katakana_to_hiragana(katakana_text):
    """Convert the leading katakana run to hiragana; returns ``(prefix, count)``."""
    return convert_hk_kh(katakana_text, (0x30a0, 0x30f7), -katakana_minus_hiragana)


def hiragana_to_katakana(hiragana_text):
    """Convert the leading hiragana run to katakana; returns ``(prefix, count)``."""
    return convert_hk_kh(hiragana_text, (0x3040, 0x3097), katakana_minus_hiragana)
Are there other pythonic ways to simplify the two functions that are very similar?
EDITED
There's also https://github.com/olsgaard/Japanese_nlp_scripts which seems to do the same thing with str.translate. Is that more efficient? More pythonic?

I'd do something like this:
KATAKANA_HIRGANA_SHIFT = 0x30a1 - 0x3041  # KATAKANA LETTER A - HIRAGANA A


def shift_chars_prefix(text, amount, condition):
    """Shift the code points of the leading characters of *text* that satisfy *condition*.

    Args:
        text: Input string.
        amount: Value added to the code point of each converted character.
        condition: Callable taking one character and returning a truthy value
            while conversion should continue.

    Returns:
        A tuple ``(converted_prefix, number_of_converted_characters)``.
    """
    shifted = []
    for char in text:
        if not condition(char):
            break
        shifted.append(chr(ord(char) + amount))
    # Report the count of converted characters rather than the loop index:
    # the index is one short when the whole text matches and is undefined
    # for empty input.
    return ''.join(shifted), len(shifted)


def katakana_to_hiragana(text):
    """Convert the leading katakana run of *text* to hiragana; returns ``(prefix, count)``."""
    return shift_chars_prefix(text, -KATAKANA_HIRGANA_SHIFT, lambda c: '\u30a0' < c < '\u30f7')


def hiragana_to_katakana(text):
    """Convert the leading hiragana run of *text* to katakana; returns ``(prefix, count)``."""
    return shift_chars_prefix(text, KATAKANA_HIRGANA_SHIFT, lambda c: '\u3040' < c < '\u3097')
You can also use regex if you don't return the length of the replaced prefix:
import re
KATAKANA_HIRGANA_SHIFT = 0x30a1 - 0x3041  # KATAKANA LETTER A - HIRAGANA A


def shift_by(n):
    """Return an ``re.sub`` replacement callable that shifts every matched character by *n* code points."""
    def replacer(match):
        return ''.join(chr(ord(c) + n) for c in match.group(0))
    return replacer


def katakana_to_hiragana(text):
    """Replace the leading katakana run of *text* with hiragana; the rest is left unchanged."""
    # Katakana sits above hiragana, so going to hiragana subtracts the shift.
    return re.sub(r'^[\u30a1-\u30f6]+', shift_by(-KATAKANA_HIRGANA_SHIFT), text)


def hiragana_to_katakana(text):
    """Replace the leading hiragana run of *text* with katakana; the rest is left unchanged."""
    return re.sub(r'^[\u3041-\u3096]+', shift_by(KATAKANA_HIRGANA_SHIFT), text)

Here’s a function that would switch each kind of kana to the other.
Unlike the given functions, it does not stop when it encounters
non-kana, but simply passes those characters through without changing
them.
Note that conversion between kana types is not as simple as this; for
example, in hiragana a long “e” sound is indicated by ええ or えい
(e.g., おねえ older sister, せんせい teacher), while in katakana one
uses a chōonpu (オネー, センセー). There are kana characters outside the
ranges you use as well.
def switch_kana_type(kana_text):
    """Replace each kind of kana with the other kind.

    Other characters are passed through unchanged.

    Returns:
        A tuple ``(converted_text, len(converted_text))``.
    """
    pieces = []
    for c in kana_text:
        if is_hiragana(c):  # Note typo fix of "is_hirgana"
            pieces.append(chr(ord(c) + katakana_minus_hiragana))
        elif is_katakana(c):
            pieces.append(chr(ord(c) - katakana_minus_hiragana))
        else:
            pieces.append(c)
    output_text = ''.join(pieces)
    return output_text, len(output_text)

Related

Counting occurrences of multiple characters in a string, with python

I'm trying to create a function that -given a string- will return the count of non-allowed characters ('error_char'), like so: 'total count of not-allowed / total length of string'.
So far I've tried:
def allowed_characters(s):
# Asker's first attempt. Intended result: the string '<count of disallowed characters>/<len(s)>'.
s = s.lower()
correct_char = 'abcdef'
error_char = 'ghijklmnopqrstuvwxyz'
counter = 0
for i in s:
if i in correct_char:
# NOTE(review): indentation was lost in this listing; if this return sits inside the loop
# (as the reported '0/56' suggests), the function exits on the first allowed character.
no_error = '0'+'/'+ str(len(s))
return no_error
elif i in error_char:
counter += 1
# NOTE(review): counter is an int, so sum(counter) raises TypeError if this line is reached.
result = str(sum(counter)) + '/' + str(len(s))
return result
but all I get is '0/56' where I'm expecting '22/56' since m,x,y,z are 'not allowed' and m repeats 19 times
allowed_characters('aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbmmmmmmmmmmmmmmmmmmmxyz')
'0/56'
Then I've tried:
def allowed_characters(s):
# Asker's second attempt, this time counting with a regular expression.
s = s.lower()
correct_char = 'abcdef'
error_char = 'ghijklmnopqrstuvwxyz'
counter = 0
for i in s:
if i in correct_char:
# Same early return as in the first attempt.
no_error = '0'+'/'+ str(len(s))
return no_error
elif i in error_char:
# `regex` is a third-party package, not the standard-library `re`.
import regex as re
# NOTE(review): compile() is given a one-element list here rather than a pattern string.
rgx_pattern = re.compile([error_char])
# NOTE(review): findall is called on the compiled pattern with error_char as its first argument and s as its second.
count_e = rgx_pattern.findall(error_char, s)
p_error = sum([count_e.count(i) for i in error_char])
# result is built, but no return statement for it appears in this listing.
result = str(p_error) + '/' + str(len(s))
But I get the same result...
I've also tried these other ways, but keep getting the same:
def allowed_characters1(s):
# Asker's third attempt: both branches end in a return.
s = s.lower()
correct_char = 'abcdef'
for i in s:
if i not in correct_char:
# Sums s.count(c) over every character c of s, not only over the disallowed ones.
counter = sum([s.count(i) for i in s])
p_error = str(counter) + '/' + str(len(s))
return p_error
elif i in correct_char:
no_error = '0'+'/'+ str(len(s))
return no_error
and...
def allowed_characters2(s):
# Asker's fourth attempt: both branches end in a return.
s = s.lower()
correct_char = 'abcdef'
for i in s:
if i not in correct_char:
# NOTE(review): s.count(i) is an int, so sum() of it raises TypeError if this line is reached.
counter = sum(s.count(i))
p_error = str(counter) + '/' + str(len(s))
return p_error
elif i in correct_char:
no_error = '0'+'/'+ str(len(s))
return no_error
I've even tried changing the logic and iterating over 'correct/error_char' instead, but nothing seems to work... I keep getting the same result over and over. It looks as though the loop stops right after first character or doesn't run the 'elif' part?

Whenever you need fast counting, it's worth thinking about collections.Counter. You can try simplifying your code like this:
Note: please don't change your problem description while people are in the middle of answering; that makes it very hard to keep the answers in sync.
There is still room to improve it, though.
from collections import Counter
def allowed_char(s):
    """Count allowed and disallowed letters in *s*, case-insensitively.

    Allowed letters are ``a``-``f``; disallowed letters are ``g``-``z``.
    Characters in neither set are ignored.

    Returns:
        A tuple ``(correct_count, error_count)``.
    """
    correct_char = 'abcdef'
    error_char = 'ghijklmnopqrstuvwxyz'
    counts = Counter(s.lower())
    correct_count = sum(count for c, count in counts.items() if c in correct_char)
    error_count = sum(count for c, count in counts.items() if c in error_char)
    return correct_count, error_count  # both counts


s = 'aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbmmmmmmmmmmmmmmmmmmmxyz'
print(allowed_char(s))  # (34, 22)
print(allowed_char('abdmmmmxyz'))  # (3, 7)
Alternatively, you really want to use for-loop and learn to process the string of characters, you could try this:
def loop_count(s):
    """Count characters of *s* (lower-cased) that are in ``'abcdef'`` and those that are not.

    Returns:
        A tuple ``(correct_count, error_count)``; every character outside
        ``'abcdef'`` counts as an error.
    """
    # Defined locally: the name was otherwise not in scope for this function.
    correct_char = 'abcdef'
    s = s.lower()
    correct_count = error_count = 0
    for c in s:
        if c in correct_char:
            correct_count += 1
        else:
            error_count += 1
    return correct_count, error_count

I would use a regex replacement trick here using len():
def allowed_characters(s):
    """Return how many characters of *s* are NOT in the range ``g``-``z``.

    The substitution deletes every character outside ``g``-``z``; what remains
    are the disallowed characters, and their number is subtracted from
    ``len(s)``. For the number of disallowed characters themselves, use the
    length of the substituted string directly.
    """
    import re  # local import: the snippet never imports the module itself

    return len(s) - len(re.sub(r'[^ghijklmnopqrstuvwxyz]+', '', s))
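The above returns the length of the input string minus the length of the input with all allowed characters removed (in other words, minus the length of the string that contains only the non-allowed characters). Note that this difference is the number of allowed characters; the number of non-allowed characters is simply len(re.sub(r'[^ghijklmnopqrstuvwxyz]+', '', s)).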

finding the longest common prefix of elements inside a list

I have a sequence print(lcp(["flower","flow","flight", "dog"])) which should return fl. Currently I can get it to return flowfl.
I can locate the instances where o or w should be removed, and tried different approaches to remove them. However they seem to hit syntax issue, which I cannot seem to resolve by myself.
I would very much appreciate a little guidance to either have the tools to remedy this issue my self, or learn from a working proposed solution.
def lcp(strs):
# Asker's attempt at the longest common prefix of a list of strings.
if not isinstance(strs, list) or len(strs) == 0:
return ""
if len(strs) == 1:
return strs[0]
original = strs[0]
original_max = len(original)
# result is initialised once here and is not reset for each later word.
result = ""
for _, word in enumerate(strs[1:],1):
current_max = len(word)
i = 0
while i < current_max and i < original_max:
copy = "".join(result)
if len(copy) and copy[i-1] not in word:
# result = result.replace(copy[i-1], "")
# result = copy[:i-1]
print(copy[i-1], copy, result.index(copy[i-1]), i, word)
# Every position where word agrees with the first string appends to result,
# so matches from each later word accumulate ("flow" + "fl" -> "flowfl").
if word[i] == original[i]:
result += word[i]
i += 1
return result
print(lcp(["flower","flow","flight", "dog"])) # returns flowfl should be fl
print(lcp(["dog","car"])) # works
print(lcp(["dog","racecar","car"])) # works
print(lcp([])) # works
print(lcp(["one"])) # works
I worked on an alternative that does not remove the characters inside the same loop but instead adds a counter at the end. However, my instincts suggest it can be solved within the for and while loops without increasing code bloat.
# Fragment: post-processing idea that tallies the characters collected in result.
if len(result) > 1:
# Maps each distinct character of result to its number of occurrences in result.
counter = {char: result.count(char) for char in result}
print(counter)

I have solved this using the below approach.
from typing import List


class Solution:
    """Longest-common-prefix solver that compares column by column against the shortest string."""

    def longestCommonPrefix(self, strs: List[str]) -> str:
        """Return the longest prefix shared by every string in *strs*.

        Returns ``""`` for an empty list and the string itself for a
        single-element list.
        """
        if not strs:
            return ""
        if len(strs) == 1:
            return strs[0]
        # The common prefix can never be longer than the shortest string.
        _, shortest = self.get_min_str(strs)
        for i, ch in enumerate(shortest):
            if any(s[i] != ch for s in strs):
                return shortest[:i]
        return shortest

    def get_min_str(self, A):
        """Return ``(length, string)`` for the first shortest string in *A*.

        Raises:
            IndexError: If *A* is empty.
        """
        shortest = A[0]
        for candidate in A[1:]:
            if len(candidate) < len(shortest):
                shortest = candidate
        return len(shortest), shortest

Returns the longest prefix that the set of words have in common.
def lcp(strs):
    """Return the longest prefix that all strings in *strs* have in common.

    Returns ``""`` for an empty list and the string itself for a
    single-element list.
    """
    if not strs:
        return ""
    result = strs[0]
    for word in strs[1:]:
        # Count matching leading characters explicitly so that an empty word
        # (or an already-empty result) yields 0 instead of relying on a loop
        # index that may be unset or left over from the previous word.
        matched = 0
        for l1, l2 in zip(result, word):
            if l1 != l2:
                break
            matched += 1
        result = result[:matched]
        if not result:
            break  # nothing in common any more
    return result
Results:
>>> print(lcp(["flower","flow","flight"]))
fl
>>> print(lcp(["flower","flow","flight", "dog"]))
>>> print(lcp(["dog","car"]))
>>> print(lcp(["dog","racecar","car"]))
>>> print(lcp([]))
>>> print(lcp(["one"]))
one
>>> print(lcp(["one", "one"]))
one

You might need to rephrase your goal.
By your description you don't want the longest common prefix, but the prefix that the most words have in common with the first one.
One of your issues is that your tests only test one real case and four edgecases. Make some more real examples.
Here's my proposition: I mostly added the elif to check if we already have a difference on the first letter to then discard the entry.
It also overwrites the original to rebuild the string based on the common prefix with the next word (if there are any)
def lcp(strs):
    """Return the prefix of the first string shared with the later strings.

    Each later word shortens the current prefix to the part both have in
    common. A word whose first letter already differs is discarded and leaves
    the prefix untouched.

    Returns:
        ``""`` if *strs* is not a list or is empty, the single element for a
        one-element list, otherwise the remaining prefix.
    """
    if not isinstance(strs, list) or len(strs) == 0:
        return ""
    if len(strs) == 1:
        return strs[0]
    original = strs[0]
    for word in strs[1:]:
        limit = min(len(word), len(original))
        i = 0
        # Stop at the first mismatch so that later coincidental matches
        # ("abcd" vs "abxd") are not added to the prefix.
        while i < limit and word[i] == original[i]:
            i += 1
        if i == 0 and limit > 0:
            continue  # first letters differ: discard this word
        original = original[:i]
    return original


print(lcp(["flower","flow","flight", "dog"])) # fl
print(lcp(["shift", "shill", "hunter", "shame"])) # sh
print(lcp(["dog","car"])) # dog
print(lcp(["dog","racecar","car"])) # dog
print(lcp(["dog","racecar","dodge"])) # do
print(lcp([])) # [nothing]
print(lcp(["one"])) # one

Function: fill the string to get a length

So I am trying to create a function that receives three parameters: a string, a number, and a character.
If len(string) is not the same as the given number, I want to fill the string up with the given character.
I tried doing this, but it didn't work. What am I missing?
def expand(text, length, char):
# Asker's attempt: pad text with char until it is length characters long.
new_text = ""
if length <= len(text):
print(text)
else:
diff = length - len(text)
if diff % 2 == 0:
# This comparison is evaluated and its result discarded; it does not change new_text.
len(new_text) == length
# Adds exactly one char on each side, whatever the value of diff.
new_text = char + text + char
else:
len(new_text) == length
new_text = char + text + char
# Overwrites the line above, so only the left-hand char remains.
new_text = char + text
# The result is printed; no return statement appears in this listing.
print(new_text)
I mean how could I create a condition that helps me to add as many "char" I need but in the same time checks if len(text) is the same with the number given? If you could explain me where should I look and how do I have to think about the problem, it would be awesome. I just want to understand the way, not just to have the answer. Thank you!

You can use this approach where multiplying a char by a number will give you a string of that length. eg. 'a' * 3 = 'aaa'
def expand(text, length, char):
    """Pad *text* on both sides with *char* until it is *length* characters long.

    The padding is split evenly; when the missing amount is odd, the extra
    character goes on the left. If *text* is already long enough it is
    returned unchanged. The result is printed and returned.
    """
    missing = length - len(text)
    if missing <= 0:
        new_text = text
    else:
        # Integer division avoids the float round-trip of int(diff / 2).
        right = missing // 2
        left = missing - right
        new_text = char * left + text + char * right
    print(new_text)
    return new_text

You could use a while to add the character until the string reaches the desired length:
new_text = text
# Test the string being built (not the unchanged input) so the loop ends,
# and alternate sides based on its current length.
while len(new_text) < length:
    if len(new_text) % 2 == 0:
        new_text = new_text + char
    else:
        new_text = char + new_text
If you want to generate a certain number of repeating characters, you can multiply the character by a number. e.g. "A"*3 --> "AAA"
So you could approach it like this:
# Fragment of a function body: split the missing length between both sides;
# the right side receives the extra character when the difference is odd.
padLeft = (length-len(text))//2
padRight = length-len(text)-padLeft
# Multiplying a string by a non-positive count gives '', so text comes back
# unchanged when it is already long enough.
return char*padLeft + text + char*padRight
There's also the recursive approach:
def expand(text, length, char):
    """Recursively pad *text* with *char*, alternating sides, up to *length* characters.

    An odd-length intermediate string is extended on the right, an even-length
    one on the left. *text* is returned unchanged once it is long enough.
    """
    if len(text) >= length:
        return text
    # Parity test (% 2): `% 1` is always 0 and would pad on the left only.
    if len(text) % 2:
        return expand(text + char, length, char)
    return expand(char + text, length, char)

Manual string 'in' function doesn't work if substring is at the end of the word

I'm trying to manually code the python string in function for an assignment. Using this code, where s is the string and t is the substring I am trying to find:
def test(s, t):
# Asker's hand-written substring check: s is the string, t the substring searched for.
# stidx is the candidate start position in s.
stidx = 0
while stidx < len(s):
# idx counts how many leading characters of t have matched so far.
idx = 0
for i in s[stidx:]:
if idx < len(t):
if t[idx] == i:
idx += 1
continue
else:
break
# NOTE(review): indentation was lost in this listing; if this check sits inside the for loop,
# it is only reached on an iteration after the last match, which would explain why a match
# ending at the very end of s is missed — confirm against the original layout.
if idx == len(t):
return True
stidx += 1
return False
The above code works, except when I am checking a substring at the very end of the word (e.g. s = 'happy' and t = 'py'). If I add an arbitrary character to the end of s, it works. Why is this?

maybe?
def test(s, t):
"""
:param s: exp: happy
:param t: exp: py
:return:
"""
tmp = result = 0
t_len = len(t)
while tmp <= len(s)-t_len:
if s[tmp: tmp+t_len] == t:
result = 1
break
else:
tmp += 1
continue
return bool(result)

Encryption code in python, call upon a function, python returns nothing. No error messages show

Okay, so I've written the following series of functions in Python 3.6.0:
def code_char(c, key):
# Shift character c by key code points and bring the result back into the letter range.
result = ord(c) + key
if c.isupper():
# Each pass moves result by 26, so the number of passes grows with the size of key.
while result > ord("Z"):
result -= 26
while result < ord("A"):
result += 26
return chr(result)
else:
while result > ord("z"):
result -= 26
while result < ord("a"):
result += 26
result = chr(result)
return result
def isletter(char):
# True for code points 65-90 and 97-122.
if 65 <= ord(char) <= 90 or 97<= ord(char) <= 122:
return True
else:
return False
def encrypt(string, key):
result = ""
length = len(string)
# Repeat and truncate key so that it has as many characters as string.
key = key * (length // len(key)) + key[0:(length % len(key))]
for i in range(0,length):
# NOTE(review): this is a generator expression, which is always truthy; isletter is never
# called here, so the else branch below is never taken.
if (isletter for i in string):
c = string[i]
# Joins all characters of key and converts them to one integer: the whole key is the shift.
num = int("".join("".join(i) for i in key))
result += code_char(c, num)
else:
c = string[i]
# NOTE(review): i is the integer loop index; adding it to a str would raise TypeError.
result += i
return result
Then I try to call on the functions with:
encrypt("This is a secret message!!", "12345678")
When python runs the program absolutely nothing happens. Nothing gets returned, and in the shell python forces me onto a blank line without indents, or >>>. i don't know what is right or wrong with the code as no error messages appear, and no results appear. Any kind of advice would be appreciated.
Thank you.

Looking at your code, I don't think this is an infinite loop. I think your loop will not be infinite but will run for a very long time since the value of key is very big, and so, subtracting 26 at a time, until it gets to an English letter ascii value, will just take forever (but not really forever)
>>> key = '12345678'
>>> length = len("This is a secret message!!")
>>> key * (length // len(key)) + key[0:(length % len(key))]
'12345678123456781234567812'
It might be a problem in the your logic, maybe in the logic generating the key, but if this is indeed the logic you want, how about using modulus rather than iterating:
def code_char(c, key):
    """Shift *c* by *key* positions, wrapping around inside a 26-letter range.

    Upper-case characters wrap within ``A``-``Z``; every other character wraps
    within ``a``-``z``. The modulo makes the cost independent of the size of
    *key*, and works for negative keys as well.
    """
    base = ord("A") if c.isupper() else ord("a")
    return chr(base + (ord(c) - base + key) % 26)


def isletter(char):
    """Return True if *char* is an ASCII letter (code point 65-90 or 97-122)."""
    return 65 <= ord(char) <= 90 or 97 <= ord(char) <= 122


def encrypt(string, key):
    """Shift every ASCII letter of *string*; other characters pass through unchanged.

    Args:
        string: Text to encrypt.
        key: Non-empty string of decimal digits. It is repeated and truncated
            to the length of *string*, and the resulting digit string as a
            whole is used as the shift for every letter.

    Returns:
        The encrypted text.
    """
    if not string:
        return ""
    length = len(string)
    key = key * (length // len(key)) + key[0:(length % len(key))]
    # Only the remainder modulo 26 matters for the shift.
    num = int(key) % 26
    return "".join(code_char(c, num) if isletter(c) else c for c in string)


# >>> encrypt("This is a secret message!!", "12345678")
# 'Ftue ue m eqodqf yqeemsq!!'

Did you intend a while loop here, or an if statement? I don't see an obvious exit from the while loops, so that may be where your code is hanging.
# Excerpt from the question's code_char (where result starts as ord(c) + key):
# each loop moves result by 26 per pass until it lies inside the letter range.
if c.isupper():
while result > ord("Z"):
result -= 26
while result < ord("A"):
result += 26
return chr(result)
else:
while result > ord("z"):
result -= 26
while result < ord("a"):
result += 26
Also, if I replace while with if above, it's giving me overflow error.
OverflowError: Python int too large to convert to C long
EDIT
After looking at #polo's comment and taking a second look at code, I believe #polo is correct. I put while loop back and added print statements. I have commented them, but you can uncomment at your end.
I've also reduced key's complexity to just key = key and reduced key from 12345678 to just 1234 to see if the code works and if it completes in reasonable time.. You can make it as complex as you want once code runs smoothly.
Here is result I got after:
>>>
key =1234
coding char = T
coding char = h
coding char = i
coding char = s
coding char =
coding char = i
coding char = s
coding char =
coding char = a
coding char =
coding char = s
coding char = e
coding char = c
coding char = r
coding char = e
coding char = t
coding char =
coding char = m
coding char = e
coding char = s
coding char = s
coding char = a
coding char = g
coding char = e
coding char = !
coding char = !
encrypted_message = Ftuezuezmzeqodqfzyqeemsqaa
Modified code below:
def code_char(c, key):
    """Shift *c* by *key* code points, then step by 26 until the result is a letter.

    Upper-case characters end up in ``A``-``Z``; every other character ends up
    in ``a``-``z``. The loops take one pass per 26 units of *key*, so large
    keys are slow.
    """
    result = ord(c) + key
    if c.isupper():
        while result > ord("Z"):
            # print("result1 = {}".format(result))
            result -= 26
        while result < ord("A"):
            # print("result2 = {}".format(result))
            result += 26
    else:
        while result > ord("z"):
            # print("result3 = {}".format(result))
            result -= 26
        while result < ord("a"):
            # print("result4 = {}".format(result))
            result += 26
    return chr(result)


def isletter(char):
    """Return True if *char* is an ASCII letter (code point 65-90 or 97-122)."""
    return 65 <= ord(char) <= 90 or 97 <= ord(char) <= 122


def encrypt(string, key):
    """Shift every character of *string* by the integer value of *key*, printing progress.

    Args:
        string: Text to encrypt.
        key: String of decimal digits; used as a whole as the shift.

    Returns:
        The encrypted text.
    """
    result = ""
    # key = key * (length // len(key)) + key[0:(length % len(key))]
    print("key ={}".format(key))  # print() call: the statement form is a SyntaxError on Python 3
    for c in string:
        # NOTE(review): the original guard `if (isletter for i in string):` is a
        # generator expression and therefore always true, so every character,
        # including spaces and punctuation, is shifted. That is kept here so the
        # output matches the run shown above; use `if isletter(c):` to pass
        # non-letters through unchanged.
        num = int(key)
        print("coding char = {}".format(c))
        result += code_char(c, num)
    return result


#encrypt("This is a secret message!!", "12345678")
encrypted_message = encrypt("This is a secret message!!", "1234")
print("encrypted_message = {}".format(encrypted_message))

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to combine two similar functions that convert between hiragana and katakana? - python

Related

Counting occurrences of multiple characters in a string, with python

finding the longest common prefix of elements inside a list

Function: fill the string to get a length

Manual string 'in' function doesn't work if substring is at the end of the word

Encryption code in python, call upon a function, python returns nothing. No error messages show

Categories

Resources