Case insensitive replace - python

What's the easiest way to do a case-insensitive string replacement in Python?

The string type doesn't support this. You're probably best off using the regular expression sub method with the re.IGNORECASE option.
>>> import re
>>> insensitive_hippo = re.compile(re.escape('hippo'), re.IGNORECASE)
>>> insensitive_hippo.sub('giraffe', 'I want a hIPpo for my birthday')
'I want a giraffe for my birthday'

import re
pattern = re.compile("hello", re.IGNORECASE)
pattern.sub("bye", "hello HeLLo HELLO")
# 'bye bye bye'

In a single line:
import re
re.sub("(?i)hello","bye", "hello HeLLo HELLO") #'bye bye bye'
re.sub("(?i)he\.llo","bye", "he.llo He.LLo HE.LLO") #'bye bye bye'
Or, use the optional "flags" argument:
import re
re.sub("hello", "bye", "hello HeLLo HELLO", flags=re.I) #'bye bye bye'
re.sub("he\.llo", "bye", "he.llo He.LLo HE.LLO", flags=re.I) #'bye bye bye'

Continuing on bFloch's answer, this function will change not one, but all occurrences of old with new - in a case insensitive fashion.
def ireplace(old, new, text):
idx = 0
while idx < len(text):
index_l = text.lower().find(old.lower(), idx)
if index_l == -1:
return text
text = text[:index_l] + new + text[index_l + len(old):]
idx = index_l + len(new)
return text

Like Blair Conrad says string.replace doesn't support this.
Use the regex re.sub, but remember to escape the replacement string first. Note that there's no flags-option in 2.6 for re.sub, so you'll have to use the embedded modifier '(?i)' (or a RE-object, see Blair Conrad's answer). Also, another pitfall is that sub will process backslash escapes in the replacement text, if a string is given. To avoid this one can instead pass in a lambda.
Here's a function:
import re
def ireplace(old, repl, text):
return re.sub('(?i)'+re.escape(old), lambda m: repl, text)
>>> ireplace('hippo?', 'giraffe!?', 'You want a hiPPO?')
'You want a giraffe!?'
>>> ireplace(r'[binfolder]', r'C:\Temp\bin', r'[BinFolder]\test.exe')
'C:\\Temp\\bin\\test.exe'

This function uses both the str.replace() and re.findall() functions.
It will replace all occurences of pattern in string with repl in a case-insensitive way.
def replace_all(pattern, repl, string) -> str:
occurences = re.findall(pattern, string, re.IGNORECASE)
for occurence in occurences:
string = string.replace(occurence, repl)
return string

This doesn't require RegularExp
def ireplace(old, new, text):
"""
Replace case insensitive
Raises ValueError if string not found
"""
index_l = text.lower().index(old.lower())
return text[:index_l] + new + text[index_l + len(old):]

An interesting observation about syntax details and options:
# Python 3.7.2 (tags/v3.7.2:9a3ffc0492, Dec 23 2018, 23:09:28) [MSC v.1916 64 bit (AMD64)] on win32
>>> import re
>>> old = "TREEROOT treeroot TREerOot"
>>> re.sub(r'(?i)treeroot', 'grassroot', old)
'grassroot grassroot grassroot'
>>> re.sub(r'treeroot', 'grassroot', old)
'TREEROOT grassroot TREerOot'
>>> re.sub(r'treeroot', 'grassroot', old, flags=re.I)
'grassroot grassroot grassroot'
>>> re.sub(r'treeroot', 'grassroot', old, re.I)
'TREEROOT grassroot TREerOot'
Using the (?i) prefix in the match expression or adding flags=re.I as a fourth argument will result in a case-insensitive match - however using just re.I as the fourth argument does not result in case-insensitive match.
For comparison:
>>> re.findall(r'treeroot', old, re.I)
['TREEROOT', 'treeroot', 'TREerOot']
>>> re.findall(r'treeroot', old)
['treeroot']

I was having \t being converted to the escape sequences (scroll a bit down), so I noted that re.sub converts backslashed escaped characters to escape sequences.
To prevent that I wrote the following:
Replace case insensitive.
import re
def ireplace(findtxt, replacetxt, data):
return replacetxt.join( re.compile(findtxt, flags=re.I).split(data) )
Also, if you want it to replace with the escape characters, like the other answers here that are getting the special meaning bashslash characters converted to escape sequences, just decode your find and, or replace string. In Python 3, might have to do something like .decode("unicode_escape") # python3
findtxt = findtxt.decode('string_escape') # python2
replacetxt = replacetxt.decode('string_escape') # python2
data = ireplace(findtxt, replacetxt, data)
Tested in Python 2.7.8

i='I want a hIPpo for my birthday'
key='hippo'
swp='giraffe'
o=(i.lower().split(key))
c=0
p=0
for w in o:
o[c]=i[p:p+len(w)]
p=p+len(key+w)
c+=1
print(swp.join(o))

Related

Removing all punctuation except white spaces python [duplicate]

It seems like there should be a simpler way than:
import string
s = "string. With. Punctuation?" # Sample string
out = s.translate(string.maketrans("",""), string.punctuation)
Is there?
From an efficiency perspective, you're not going to beat
s.translate(None, string.punctuation)
For higher versions of Python use the following code:
s.translate(str.maketrans('', '', string.punctuation))
It's performing raw string operations in C with a lookup table - there's not much that will beat that but writing your own C code.
If speed isn't a worry, another option though is:
exclude = set(string.punctuation)
s = ''.join(ch for ch in s if ch not in exclude)
This is faster than s.replace with each char, but won't perform as well as non-pure python approaches such as regexes or string.translate, as you can see from the below timings. For this type of problem, doing it at as low a level as possible pays off.
Timing code:
import re, string, timeit
s = "string. With. Punctuation"
exclude = set(string.punctuation)
table = string.maketrans("","")
regex = re.compile('[%s]' % re.escape(string.punctuation))
def test_set(s):
return ''.join(ch for ch in s if ch not in exclude)
def test_re(s): # From Vinko's solution, with fix.
return regex.sub('', s)
def test_trans(s):
return s.translate(table, string.punctuation)
def test_repl(s): # From S.Lott's solution
for c in string.punctuation:
s=s.replace(c,"")
return s
print "sets :",timeit.Timer('f(s)', 'from __main__ import s,test_set as f').timeit(1000000)
print "regex :",timeit.Timer('f(s)', 'from __main__ import s,test_re as f').timeit(1000000)
print "translate :",timeit.Timer('f(s)', 'from __main__ import s,test_trans as f').timeit(1000000)
print "replace :",timeit.Timer('f(s)', 'from __main__ import s,test_repl as f').timeit(1000000)
This gives the following results:
sets : 19.8566138744
regex : 6.86155414581
translate : 2.12455511093
replace : 28.4436721802
Regular expressions are simple enough, if you know them.
import re
s = "string. With. Punctuation?"
s = re.sub(r'[^\w\s]','',s)
For the convenience of usage, I sum up the note of striping punctuation from a string in both Python 2 and Python 3. Please refer to other answers for the detailed description.
Python 2
import string
s = "string. With. Punctuation?"
table = string.maketrans("","")
new_s = s.translate(table, string.punctuation) # Output: string without punctuation
Python 3
import string
s = "string. With. Punctuation?"
table = str.maketrans(dict.fromkeys(string.punctuation)) # OR {key: None for key in string.punctuation}
new_s = s.translate(table) # Output: string without punctuation
myString.translate(None, string.punctuation)
Not necessarily simpler, but a different way, if you are more familiar with the re family.
import re, string
s = "string. With. Punctuation?" # Sample string
out = re.sub('[%s]' % re.escape(string.punctuation), '', s)
string.punctuation is ASCII only! A more correct (but also much slower) way is to use the unicodedata module:
# -*- coding: utf-8 -*-
from unicodedata import category
s = u'String — with - «punctation »...'
s = ''.join(ch for ch in s if category(ch)[0] != 'P')
print 'stripped', s
You can generalize and strip other types of characters as well:
''.join(ch for ch in s if category(ch)[0] not in 'SP')
It will also strip characters like ~*+§$ which may or may not be "punctuation" depending on one's point of view.
I usually use something like this:
>>> s = "string. With. Punctuation?" # Sample string
>>> import string
>>> for c in string.punctuation:
... s= s.replace(c,"")
...
>>> s
'string With Punctuation'
For Python 3 str or Python 2 unicode values, str.translate() only takes a dictionary; codepoints (integers) are looked up in that mapping and anything mapped to None is removed.
To remove (some?) punctuation then, use:
import string
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))
s.translate(remove_punct_map)
The dict.fromkeys() class method makes it trivial to create the mapping, setting all values to None based on the sequence of keys.
To remove all punctuation, not just ASCII punctuation, your table needs to be a little bigger; see J.F. Sebastian's answer (Python 3 version):
import unicodedata
import sys
remove_punct_map = dict.fromkeys(i for i in range(sys.maxunicode)
if unicodedata.category(chr(i)).startswith('P'))
string.punctuation misses loads of punctuation marks that are commonly used in the real world. How about a solution that works for non-ASCII punctuation?
import regex
s = u"string. With. Some・Really Weird、Non?ASCII。 「(Punctuation)」?"
remove = regex.compile(ur'[\p{C}|\p{M}|\p{P}|\p{S}|\p{Z}]+', regex.UNICODE)
remove.sub(u" ", s).strip()
Personally, I believe this is the best way to remove punctuation from a string in Python because:
It removes all Unicode punctuation
It's easily modifiable, e.g. you can remove the \{S} if you want to remove punctuation, but keep symbols like $.
You can get really specific about what you want to keep and what you want to remove, for example \{Pd} will only remove dashes.
This regex also normalizes whitespace. It maps tabs, carriage returns, and other oddities to nice, single spaces.
This uses Unicode character properties, which you can read more about on Wikipedia.
I haven't seen this answer yet. Just use a regex; it removes all characters besides word characters (\w) and number characters (\d), followed by a whitespace character (\s):
import re
s = "string. With. Punctuation?" # Sample string
out = re.sub(ur'[^\w\d\s]+', '', s)
Here's a one-liner for Python 3.5:
import string
"l*ots! o(f. p#u)n[c}t]u[a'ti\"on#$^?/".translate(str.maketrans({a:None for a in string.punctuation}))
This might not be the best solution however this is how I did it.
import string
f = lambda x: ''.join([i for i in x if i not in string.punctuation])
import re
s = "string. With. Punctuation?" # Sample string
out = re.sub(r'[^a-zA-Z0-9\s]', '', s)
Here is a function I wrote. It's not very efficient, but it is simple and you can add or remove any punctuation that you desire:
def stripPunc(wordList):
"""Strips punctuation from list of words"""
puncList = [".",";",":","!","?","/","\\",",","#","#","$","&",")","(","\""]
for punc in puncList:
for word in wordList:
wordList=[word.replace(punc,'') for word in wordList]
return wordList
>>> s = "string. With. Punctuation?"
>>> s = re.sub(r'[^\w\s]','',s)
>>> re.split(r'\s*', s)
['string', 'With', 'Punctuation']
Just as an update, I rewrote the #Brian example in Python 3 and made changes to it to move regex compile step inside of the function. My thought here was to time every single step needed to make the function work. Perhaps you are using distributed computing and can't have regex object shared between your workers and need to have re.compile step at each worker. Also, I was curious to time two different implementations of maketrans for Python 3
table = str.maketrans({key: None for key in string.punctuation})
vs
table = str.maketrans('', '', string.punctuation)
Plus I added another method to use set, where I take advantage of intersection function to reduce number of iterations.
This is the complete code:
import re, string, timeit
s = "string. With. Punctuation"
def test_set(s):
exclude = set(string.punctuation)
return ''.join(ch for ch in s if ch not in exclude)
def test_set2(s):
_punctuation = set(string.punctuation)
for punct in set(s).intersection(_punctuation):
s = s.replace(punct, ' ')
return ' '.join(s.split())
def test_re(s): # From Vinko's solution, with fix.
regex = re.compile('[%s]' % re.escape(string.punctuation))
return regex.sub('', s)
def test_trans(s):
table = str.maketrans({key: None for key in string.punctuation})
return s.translate(table)
def test_trans2(s):
table = str.maketrans('', '', string.punctuation)
return(s.translate(table))
def test_repl(s): # From S.Lott's solution
for c in string.punctuation:
s=s.replace(c,"")
return s
print("sets :",timeit.Timer('f(s)', 'from __main__ import s,test_set as f').timeit(1000000))
print("sets2 :",timeit.Timer('f(s)', 'from __main__ import s,test_set2 as f').timeit(1000000))
print("regex :",timeit.Timer('f(s)', 'from __main__ import s,test_re as f').timeit(1000000))
print("translate :",timeit.Timer('f(s)', 'from __main__ import s,test_trans as f').timeit(1000000))
print("translate2 :",timeit.Timer('f(s)', 'from __main__ import s,test_trans2 as f').timeit(1000000))
print("replace :",timeit.Timer('f(s)', 'from __main__ import s,test_repl as f').timeit(1000000))
This is my results:
sets : 3.1830138750374317
sets2 : 2.189873124472797
regex : 7.142953420989215
translate : 4.243278483860195
translate2 : 2.427158243022859
replace : 4.579746678471565
A one-liner might be helpful in not very strict cases:
''.join([c for c in s if c.isalnum() or c.isspace()])
Here's a solution without regex.
import string
input_text = "!where??and!!or$$then:)"
punctuation_replacer = string.maketrans(string.punctuation, ' '*len(string.punctuation))
print ' '.join(input_text.translate(punctuation_replacer).split()).strip()
Output>> where and or then
Replaces the punctuations with spaces
Replace multiple spaces in between words with a single space
Remove the trailing spaces, if any with
strip()
Why none of you use this?
''.join(filter(str.isalnum, s))
Too slow?
I was looking for a really simple solution. here's what I got:
import re
s = "string. With. Punctuation?"
s = re.sub(r'[\W\s]', ' ', s)
print(s)
'string With Punctuation '
Here's one other easy way to do it using RegEx
import re
punct = re.compile(r'(\w+)')
sentence = 'This ! is : a # sample $ sentence.' # Text with punctuation
tokenized = [m.group() for m in punct.finditer(sentence)]
sentence = ' '.join(tokenized)
print(sentence)
'This is a sample sentence'
# FIRST METHOD
# Storing all punctuations in a variable
punctuation='!?,.:;"\')(_-'
newstring ='' # Creating empty string
word = raw_input("Enter string: ")
for i in word:
if(i not in punctuation):
newstring += i
print ("The string without punctuation is", newstring)
# SECOND METHOD
word = raw_input("Enter string: ")
punctuation = '!?,.:;"\')(_-'
newstring = word.translate(None, punctuation)
print ("The string without punctuation is",newstring)
# Output for both methods
Enter string: hello! welcome -to_python(programming.language)??,
The string without punctuation is: hello welcome topythonprogramminglanguage
with open('one.txt','r')as myFile:
str1=myFile.read()
print(str1)
punctuation = ['(', ')', '?', ':', ';', ',', '.', '!', '/', '"', "'"]
for i in punctuation:
str1 = str1.replace(i," ")
myList=[]
myList.extend(str1.split(" "))
print (str1)
for i in myList:
print(i,end='\n')
print ("____________")
Try that one :)
regex.sub(r'\p{P}','', s)
The question does not have a lot of specifics, so the approach I took is to come up with a solution with the simplest interpretation of the problem: just remove the punctuation.
Note that solutions presented don't account for contracted words (e.g., you're) or hyphenated words (e.g., anal-retentive)...which is debated as to whether they should or shouldn't be treated as punctuations...nor to account for non-English character set or anything like that...because those specifics were not mentioned in the question. Someone argued that space is punctuation, which is technically correct...but to me it makes zero sense in the context of the question at hand.
# using lambda
''.join(filter(lambda c: c not in string.punctuation, s))
# using list comprehension
''.join('' if c in string.punctuation else c for c in s)
Apparently I can't supply edits to the selected answer, so here's an update which works for Python 3. The translate approach is still the most efficient option when doing non-trivial transformations.
Credit for the original heavy lifting to #Brian above. And thanks to #ddejohn for his excellent suggestion for improvement to the original test.
#!/usr/bin/env python3
"""Determination of most efficient way to remove punctuation in Python 3.
Results in Python 3.8.10 on my system using the default arguments:
set : 51.897
regex : 17.901
translate : 2.059
replace : 13.209
"""
import argparse
import re
import string
import timeit
parser = argparse.ArgumentParser()
parser.add_argument("--filename", "-f", default=argparse.__file__)
parser.add_argument("--iterations", "-i", type=int, default=10000)
opts = parser.parse_args()
with open(opts.filename) as fp:
s = fp.read()
exclude = set(string.punctuation)
table = str.maketrans("", "", string.punctuation)
regex = re.compile(f"[{re.escape(string.punctuation)}]")
def test_set(s):
return "".join(ch for ch in s if ch not in exclude)
def test_regex(s): # From Vinko's solution, with fix.
return regex.sub("", s)
def test_translate(s):
return s.translate(table)
def test_replace(s): # From S.Lott's solution
for c in string.punctuation:
s = s.replace(c, "")
return s
opts = dict(globals=globals(), number=opts.iterations)
solutions = "set", "regex", "translate", "replace"
for solution in solutions:
elapsed = timeit.timeit(f"test_{solution}(s)", **opts)
print(f"{solution:<10}: {elapsed:6.3f}")
For serious natural language processing (NLP), you should let a library like SpaCy handle punctuation through tokenization, which you can then manually tweak to your needs.
For example, how do you want to handle hyphens in words? Exceptional cases like abbreviations? Begin and end quotes? URLs? IN NLP it's often useful to separate out a contraction like "let's" into "let" and "'s" for further processing.
Considering unicode. Code checked in python3.
from unicodedata import category
text = 'hi, how are you?'
text_without_punc = ''.join(ch for ch in text if not category(ch).startswith('P'))
You can also do this:
import string
' '.join(word.strip(string.punctuation) for word in 'text'.split())
When you deal with the Unicode strings, I suggest using PyPi regex module because it supports both Unicode property classes (like \p{X} / \P{X}) and POSIX character classes (like [:name:]).
Just install the package by typing pip install regex (or pip3 install regex) in your terminal and hit ENTER.
In case you need to remove punctuation and symbols of any kind (that is, anything other than letters, digits and whitespace) you can use
regex.sub(r'[\p{P}\p{S}]', '', text) # to remove one by one
regex.sub(r'[\p{P}\p{S}]+', '', text) # to remove all consecutive punctuation/symbols with one go
regex.sub(r'[[:punct:]]+', '', text) # Same with a POSIX character class
See a Python demo online:
import regex
text = 'भारत India <><>^$.,,! 002'
new_text = regex.sub(r'[\p{P}\p{S}\s]+', ' ', text).lower().strip()
# OR
# new_text = regex.sub(r'[[:punct:]\s]+', ' ', text).lower().strip()
print(new_text)
# => भारत india 002
Here, I added a whitespace \s pattern to the character class

Replace non-ASCII characters with a single space

I need to replace all non-ASCII (\x00-\x7F) characters with a space. I'm surprised that this is not dead-easy in Python, unless I'm missing something. The following function simply removes all non-ASCII characters:
def remove_non_ascii_1(text):
return ''.join(i for i in text if ord(i)<128)
And this one replaces non-ASCII characters with the amount of spaces as per the amount of bytes in the character code point (i.e. the – character is replaced with 3 spaces):
def remove_non_ascii_2(text):
return re.sub(r'[^\x00-\x7F]',' ', text)
How can I replace all non-ASCII characters with a single space?
Of the myriad of similar SO questions, none address character replacement as opposed to stripping, and additionally address all non-ascii characters not a specific character.
Your ''.join() expression is filtering, removing anything non-ASCII; you could use a conditional expression instead:
return ''.join([i if ord(i) < 128 else ' ' for i in text])
This handles characters one by one and would still use one space per character replaced.
Your regular expression should just replace consecutive non-ASCII characters with a space:
re.sub(r'[^\x00-\x7F]+',' ', text)
Note the + there.
For you the get the most alike representation of your original string I recommend the unidecode module:
# python 2.x:
from unidecode import unidecode
def remove_non_ascii(text):
return unidecode(unicode(text, encoding = "utf-8"))
Then you can use it in a string:
remove_non_ascii("Ceñía")
Cenia
For character processing, use Unicode strings:
PythonWin 3.3.0 (v3.3.0:bd8afb90ebf2, Sep 29 2012, 10:57:17) [MSC v.1600 64 bit (AMD64)] on win32.
>>> s='ABC马克def'
>>> import re
>>> re.sub(r'[^\x00-\x7f]',r' ',s) # Each char is a Unicode codepoint.
'ABC def'
>>> b = s.encode('utf8')
>>> re.sub(rb'[^\x00-\x7f]',rb' ',b) # Each char is a 3-byte UTF-8 sequence.
b'ABC def'
But note you will still have a problem if your string contains decomposed Unicode characters (separate character and combining accent marks, for example):
>>> s = 'mañana'
>>> len(s)
6
>>> import unicodedata as ud
>>> n=ud.normalize('NFD',s)
>>> n
'mañana'
>>> len(n)
7
>>> re.sub(r'[^\x00-\x7f]',r' ',s) # single codepoint
'ma ana'
>>> re.sub(r'[^\x00-\x7f]',r' ',n) # only combining mark replaced
'man ana'
If the replacement character can be '?' instead of a space, then I'd suggest result = text.encode('ascii', 'replace').decode():
"""Test the performance of different non-ASCII replacement methods."""
import re
from timeit import timeit
# 10_000 is typical in the project that I'm working on and most of the text
# is going to be non-ASCII.
text = 'Æ' * 10_000
print(timeit(
"""
result = ''.join([c if ord(c) < 128 else '?' for c in text])
""",
number=1000,
globals=globals(),
))
print(timeit(
"""
result = text.encode('ascii', 'replace').decode()
""",
number=1000,
globals=globals(),
))
Results:
0.7208260721400134
0.009975979187503592
What about this one?
def replace_trash(unicode_string):
for i in range(0, len(unicode_string)):
try:
unicode_string[i].encode("ascii")
except:
#means it's non-ASCII
unicode_string=unicode_string[i].replace(" ") #replacing it with a single space
return unicode_string
As a native and efficient approach, you don't need to use ord or any loop over the characters. Just encode with ascii and ignore the errors.
The following will just remove the non-ascii characters:
new_string = old_string.encode('ascii',errors='ignore')
Now if you want to replace the deleted characters just do the following:
final_string = new_string + b' ' * (len(old_string) - len(new_string))
When we use the ascii() it escapes the non-ascii characters and it doesn't change ascii characters correctly. So my main thought is, it doesn't change the ASCII characters, so I am iterating through the string and checking if the character is changed. If it changed then replacing it with the replacer, what you give.
For example: ' '(a single space) or '?' (with a question mark).
def remove(x, replacer):
for i in x:
if f"'{i}'" == ascii(i):
pass
else:
x=x.replace(i,replacer)
return x
remove('hái',' ')
Result: "h i" (with single space between).
Syntax : remove(str,non_ascii_replacer)
str = Here you will give the string you want to work with.
non_ascii_replacer = Here you will give the replacer which you want to replace all the non ASCII characters with.
Pre-processing using Raku (formerly known as Perl_6)
~$ raku -pe 's:g/ <:!ASCII>+ / /;' file
Sample Input:
Peace be upon you
السلام عليكم
שלום עליכם
Paz sobre vosotros
Sample Output:
Peace be upon you
Paz sobre vosotros
Note, you can get extensive information on the matches using the following code:
~$ raku -ne 'say s:g/ <:!ASCII>+ / /.raku;' file
$( )
$(Match.new(:orig("السلام عليكم"), :from(0), :pos(6)), Match.new(:orig("السلام عليكم"), :from(7), :pos(12)))
$(Match.new(:orig("שלום עליכם"), :from(0), :pos(4)), Match.new(:orig("שלום עליכם"), :from(5), :pos(10)))
$( )
$( )
Or more simply, you can just visualize the replacement blank spaces:
~$ raku -ne 'say S:g/ <:!ASCII>+ / /.raku;' file
"Peace be upon you"
" "
" "
"Paz sobre vosotros"
""
https://docs.raku.org/language/regexes#Unicode_properties
https://www.codesections.com/blog/raku-unicode/
https://raku.org
def filterSpecialChars(strInput):
result = []
for character in strInput:
ordVal = ord(character)
if ordVal < 0 or ordVal > 127:
result.append(' ')
else:
result.append(character)
return ''.join(result)
And call it like this:
result = filterSpecialChars('Ceñía mañana')
print(result)
My problem was that my string contained things like Belgià for België and &#x20AC for the € sign. And I didn't want to replace them with spaces. But wth the right symbol itself.
my solution was string.encode('Latin1').decode('utf-8')
Potentially for a different question, but I'm providing my version of #Alvero's answer (using unidecode). I want to do a "regular" strip on my strings, i.e. the beginning and end of my string for whitespace characters, and then replace only other whitespace characters with a "regular" space, i.e.
"Ceñíaㅤmañanaㅤㅤㅤㅤ"
to
"Ceñía mañana"
,
def safely_stripped(s: str):
return ' '.join(
stripped for stripped in
(bit.strip() for bit in
''.join((c if unidecode(c) else ' ') for c in s).strip().split())
if stripped)
We first replace all non-unicode spaces with a regular space (and join it back again),
''.join((c if unidecode(c) else ' ') for c in s)
And then we split that again, with python's normal split, and strip each "bit",
(bit.strip() for bit in s.split())
And lastly join those back again, but only if the string passes an if test,
' '.join(stripped for stripped in s if stripped)
And with that, safely_stripped('ㅤㅤㅤㅤCeñíaㅤmañanaㅤㅤㅤㅤ') correctly returns 'Ceñía mañana'.
To replace all non-ASCII (\x00-\x7F) characters with a space:
''.join(map(lambda x: x if ord(x) in range(0, 128) else ' ', text))
To replace all visible characters, try this:
import string
''.join(map(lambda x: x if x in string.printable and x not in string.whitespace else ' ', text))
This will give the same result:
''.join(map(lambda x: x if ord(x) in range(32, 128) else ' ', text))

python: how to remove '$'?

All I want to do is remove the dollar sign '$'. This seems simple, but I really don't know why my code isn't working.
import re
input = '$5'
if '$' in input:
input = re.sub(re.compile('$'), '', input)
print input
Input still is '$5' instead of just '5'! Can anyone help?
Try using replace instead:
input = input.replace('$', '')
As Madbreaks has stated, $ means match the end of the line in a regular expression.
Here is a handy link to regular expressions: http://docs.python.org/2/library/re.html
In this case, I'd use str.translate
>>> '$$foo$$'.translate(None,'$')
'foo'
And for benchmarking purposes:
>>> def repl(s):
... return s.replace('$','')
...
>>> def trans(s):
... return s.translate(None,'$')
...
>>> import timeit
>>> s = '$$foo bar baz $ qux'
>>> print timeit.timeit('repl(s)','from __main__ import repl,s')
0.969965934753
>>> print timeit.timeit('trans(s)','from __main__ import trans,s')
0.796354055405
There are a number of differences between str.replace and str.translate. The most notable is that str.translate is useful for switching 1 character with another whereas str.replace replaces 1 substring with another. So, for problems like, I want to delete all characters a,b,c, or I want to change a to d, I suggest str.translate. Conversely, problems like "I want to replace the substring abc with def" are well suited for str.replace.
Note that your example doesn't work because $ has special meaning in regex (it matches at the end of a string). To get it to work with regex you need to escape the $:
>>> re.sub('\$','',s)
'foo bar baz qux'
works OK.
$ is a special character in regular expressions that translates to 'end of the string'
you need to escape it if you want to use it literally
try this:
import re
input = "$5"
if "$" in input:
input = re.sub(re.compile('\$'), '', input)
print input
You need to escape the dollar sign - otherwise python thinks it is an anchor http://docs.python.org/2/library/re.html
import re
fred = "$hdkhsd%$"
print re.sub ("\$","!", fred)
>> !hdkhsd%!
Aside from the other answers, you can also use strip():
input = input.strip('$')

Combining multiple regex substitutions

I'm trying to delete some things from a block of text using regex. I have all of my patterns ready, but I can't seem to be able to remove two (or more) that overlap.
For example:
import re
r1 = r'I am'
r2 = r'am foo'
text = 'I am foo'
re.sub(r1, '', text) # Returns ' foo'
re.sub(r2, '', text) # Returns 'I '
How do I replace both of the occurrences simultaneously and end up with an empty string?
I ended up using a slightly modified version of Ned Batchelder's answer:
def clean(self, text):
mask = bytearray(len(text))
for pattern in patterns:
for match in re.finditer(pattern, text):
r = range(match.start(), match.end())
mask[r] = 'x' * len(r)
return ''.join(character for character, bit in zip(text, mask) if not bit)
You can't do it with consecutive re.sub calls as you have shown. You can use re.finditer to find them all. Each match will provide you with a match object, which has .start and .end attributes indicating their positions. You can collect all those together, and then remove characters at the end.
Here I use a bytearray as a mutable string, used as a mask. It's initialized to zero bytes, and I mark with an 'x' all the bytes that match any regex. Then I use the bit mask to select the characters to keep in the original string, and build a new string with only the unmatched characters:
bits = bytearray(len(text))
for pat in patterns:
for m in re.finditer(pat, text):
bits[m.start():m.end()] = 'x' * (m.end()-m.start())
new_string = ''.join(c for c,bit in zip(text, bits) if not bit)
Not to be a downer, but the short answer is that I'm pretty sure you can't. Can you change your regex so that it doesn't require overlapping?
If you still want to do this, I would try keeping track of the start and stop indices of each match made on the original string. Then go through the string and only keep characters not in any deletion range?
Quite efficient too is a solution coming from ... Perl combine the regexps in one:
# aptitude install regexp-assemble
$ regexp-assemble
I am
I am foo
Ctrl + D
I am(?: foo)?
regexp-assemble takes all the variants of regexps or string you want to match and then
combine them in one. And yes it changes the initial problem to another one since it is not about matching overlapping regexp anymore, but combining regexp for a match
And Then you can use it in your code:
$ python
Python 2.7.3 (default, Aug 1 2012, 05:14:39)
[GCC 4.6.3] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import re
>>> re.sub("I am foo","I am(?: foo)?","")
''
A port of Regexp::Assemble in python would be nice :)
Here is an alternative that filters the strings on the fly using itertools.compress on the text with a selector iterator. The selector returns True if the character should be kept. selector_for_patterns creates one selector for every pattern. The selector are combined with the all function (only when all pattern want to keep a character it should be in the resulting string).
import itertools
import re
def selector_for_pattern(text, pattern):
i = 0
for m in re.finditer(pattern, text):
for _ in xrange(i, m.start()):
yield True
for _ in xrange(m.start(), m.end()):
yield False
i = m.end()
for _ in xrange(i, len(text)):
yield True
def clean(text, patterns):
gen = [selector_for_pattern(text, pattern) for pattern in patterns]
selector = itertools.imap(all, itertools.izip(* gen))
return "".join(itertools.compress(text, selector))

Remove all special characters, punctuation and spaces from string

I need to remove all special characters, punctuation and spaces from a string so that I only have letters and numbers.
This can be done without regex:
>>> string = "Special $#! characters spaces 888323"
>>> ''.join(e for e in string if e.isalnum())
'Specialcharactersspaces888323'
You can use str.isalnum:
S.isalnum() -> bool
Return True if all characters in S are alphanumeric
and there is at least one character in S, False otherwise.
If you insist on using regex, other solutions will do fine. However note that if it can be done without using a regular expression, that's the best way to go about it.
Here is a regex to match a string of characters that are not a letters or numbers:
[^A-Za-z0-9]+
Here is the Python command to do a regex substitution:
re.sub('[^A-Za-z0-9]+', '', mystring)
Shorter way :
import re
cleanString = re.sub('\W+','', string )
If you want spaces between words and numbers substitute '' with ' '
TLDR
I timed the provided answers.
import re
re.sub('\W+','', string)
is typically 3x faster than the next fastest provided top answer.
Caution should be taken when using this option. Some special characters (e.g. ø) may not be striped using this method.
After seeing this, I was interested in expanding on the provided answers by finding out which executes in the least amount of time, so I went through and checked some of the proposed answers with timeit against two of the example strings:
string1 = 'Special $#! characters spaces 888323'
string2 = 'how much for the maple syrup? $20.99? That s ridiculous!!!'
Example 1
'.join(e for e in string if e.isalnum())
string1 - Result: 10.7061979771
string2 - Result: 7.78372597694
Example 2
import re
re.sub('[^A-Za-z0-9]+', '', string)
string1 - Result: 7.10785102844
string2 - Result: 4.12814903259
Example 3
import re
re.sub('\W+','', string)
string1 - Result: 3.11899876595
string2 - Result: 2.78014397621
The above results are a product of the lowest returned result from an average of: repeat(3, 2000000)
Example 3 can be 3x faster than Example 1.
Python 2.*
I think just filter(str.isalnum, string) works
In [20]: filter(str.isalnum, 'string with special chars like !,#$% etcs.')
Out[20]: 'stringwithspecialcharslikeetcs'
Python 3.*
In Python3, filter( ) function would return an itertable object (instead of string unlike in above). One has to join back to get a string from itertable:
''.join(filter(str.isalnum, string))
or to pass list in join use (not sure but can be fast a bit)
''.join([*filter(str.isalnum, string)])
note: unpacking in [*args] valid from Python >= 3.5
#!/usr/bin/python
import re
strs = "how much for the maple syrup? $20.99? That's ricidulous!!!"
print strs
nstr = re.sub(r'[?|$|.|!]',r'',strs)
print nstr
nestr = re.sub(r'[^a-zA-Z0-9 ]',r'',nstr)
print nestr
you can add more special character and that will be replaced by '' means nothing i.e they will be removed.
Differently than everyone else did using regex, I would try to exclude every character that is not what I want, instead of enumerating explicitly what I don't want.
For example, if I want only characters from 'a to z' (upper and lower case) and numbers, I would exclude everything else:
import re
s = re.sub(r"[^a-zA-Z0-9]","",s)
This means "substitute every character that is not a number, or a character in the range 'a to z' or 'A to Z' with an empty string".
In fact, if you insert the special character ^ at the first place of your regex, you will get the negation.
Extra tip: if you also need to lowercase the result, you can make the regex even faster and easier, as long as you won't find any uppercase now.
import re
s = re.sub(r"[^a-z0-9]","",s.lower())
string.punctuation contains following characters:
'!"#$%&\'()*+,-./:;<=>?#[\]^_`{|}~'
You can use translate and maketrans functions to map punctuations to empty values (replace)
import string
'This, is. A test!'.translate(str.maketrans('', '', string.punctuation))
Output:
'This is A test'
s = re.sub(r"[-()\"#/#;:<>{}`+=~|.!?,]", "", s)
Assuming you want to use a regex and you want/need Unicode-cognisant 2.x code that is 2to3-ready:
>>> import re
>>> rx = re.compile(u'[\W_]+', re.UNICODE)
>>> data = u''.join(unichr(i) for i in range(256))
>>> rx.sub(u'', data)
u'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\xaa\xb2 [snip] \xfe\xff'
>>>
The most generic approach is using the 'categories' of the unicodedata table which classifies every single character. E.g. the following code filters only printable characters based on their category:
import unicodedata
# strip of crap characters (based on the Unicode database
# categorization:
# http://www.sql-und-xml.de/unicode-database/#kategorien
PRINTABLE = set(('Lu', 'Ll', 'Nd', 'Zs'))
def filter_non_printable(s):
result = []
ws_last = False
for c in s:
c = unicodedata.category(c) in PRINTABLE and c or u'#'
result.append(c)
return u''.join(result).replace(u'#', u' ')
Look at the given URL above for all related categories. You also can of course filter
by the punctuation categories.
For other languages like German, Spanish, Danish, French etc that contain special characters (like German "Umlaute" as ü, ä, ö) simply add these to the regex search string:
Example for German:
re.sub('[^A-ZÜÖÄa-z0-9]+', '', mystring)
This will remove all special characters, punctuation, and spaces from a string and only have numbers and letters.
import re
sample_str = "Hel&&lo %% Wo$#rl#d"
# using isalnum()
print("".join(k for k in sample_str if k.isalnum()))
# using regex
op2 = re.sub("[^A-Za-z]", "", sample_str)
print(f"op2 = ", op2)
special_char_list = ["$", "#", "#", "&", "%"]
# using list comprehension
op1 = "".join([k for k in sample_str if k not in special_char_list])
print(f"op1 = ", op1)
# using lambda function
op3 = "".join(filter(lambda x: x not in special_char_list, sample_str))
print(f"op3 = ", op3)
Use translate:
import string
def clean(instr):
return instr.translate(None, string.punctuation + ' ')
Caveat: Only works on ascii strings.
This will remove all non-alphanumeric characters except spaces.
string = "Special $#! characters spaces 888323"
''.join(e for e in string if (e.isalnum() or e.isspace()))
Special characters spaces 888323
import re
my_string = """Strings are amongst the most popular data types in Python. We can create the strings by enclosing characters in quotes. Python treats single quotes the
same as double quotes."""
# if we need to count the word python that ends with or without ',' or '.' at end
count = 0
for i in text:
if i.endswith("."):
text[count] = re.sub("^([a-z]+)(.)?$", r"\1", i)
count += 1
print("The count of Python : ", text.count("python"))
After 10 Years, below I wrote there is the best solution.
You can remove/clean all special characters, punctuation, ASCII characters and spaces from the string.
from clean_text import clean
string = 'Special $#! characters spaces 888323'
new = clean(string,lower=False,no_currency_symbols=True, no_punct = True,replace_with_currency_symbol='')
print(new)
Output ==> 'Special characters spaces 888323'
you can replace space if you want.
update = new.replace(' ','')
print(update)
Output ==> 'Specialcharactersspaces888323'
function regexFuntion(st) {
const regx = /[^\w\s]/gi; // allow : [a-zA-Z0-9, space]
st = st.replace(regx, ''); // remove all data without [a-zA-Z0-9, space]
st = st.replace(/\s\s+/g, ' '); // remove multiple space
return st;
}
console.log(regexFuntion('$Hello; # -world--78asdf+-===asdflkj******lkjasdfj67;'));
// Output: Hello world78asdfasdflkjlkjasdfj67
import re
abc = "askhnl#$%askdjalsdk"
ddd = abc.replace("#$%","")
print (ddd)
and you shall see your result as
'askhnlaskdjalsdk

Categories

Resources