Find strings that begin with a '#' and create a link - Python

I want to check whether a string (a tweet) begins with a '#' (i.e. is a hashtag) or not, and if so create a link.
Below is what I've tried so far but it doesn't work (error on the last line).
How can I fix this and will the code work for the purpose?
import re

tag_regex = re.compile(r"""
    [\b#\w\w+]    # hashtag found!""", re.VERBOSE)

message = raw_message
for tag in tag_regex.findall(raw_message):
    message = message.replace(url, '' + message + '')

>>> msg = '#my_tag the rest of my tweet'
>>> re.sub('^#(\w+) (.*)', r'\2', msg)
'the rest of my tweet'
>>>
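For the original goal of turning each hashtag into a link, a substitution with a backreference can do the whole job in one pass. Here is a minimal sketch; the /tags/ link target is an assumption, since the question doesn't specify what the link should point to:

import re

# '#' followed by one or more word characters
tag_regex = re.compile(r'#(\w+)')

def linkify_hashtags(message):
    # Wrap each hashtag in an anchor tag; '/tags/' is a placeholder path.
    return tag_regex.sub(r'<a href="/tags/\1">#\1</a>', message)

print(linkify_hashtags('#my_tag the rest of my tweet'))
# <a href="/tags/my_tag">#my_tag</a> the rest of my tweet

Note that the character class [\b#\w\w+] from the question matches any single one of those characters, whereas #(\w+) matches the '#' and captures the whole tag name for reuse in the replacement.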

Specific words from a text file with PLY

I am making a lexical analyzer for specific words in a .txt file. I declare certain words as reserved and try to print only those selected words on the screen, but the result I get is that it takes all the words in the txt file and prints them. I've been following the tutorial and the official PLY documentation at http://www.dabeaz.com/ply/ply.html#ply_nn6, but I still haven't achieved my goal. Could someone help me with this? Thank you very much.
import ply.lex as lex
import re
import os
import sys

reservadas = {
    'if': 'if',
    'then': 'then',
    'else': 'else',
    'while': 'while',
}

tokens = ['ID', 'NUMBER', 'PLUS', 'MINUS', 'TIMES', 'DIVIDE',
          'ODD', 'ASSIGN', 'NE', 'LT', 'LTE', 'GT', 'GTE',
          'LPARENT', 'RPARENT', 'COMMA', 'SEMMICOLOM',
          'DOT', 'UPDATE'
          ] + list(reservadas.values())

#tokens = tokens+reservadas
# reservadas = {
#     'begin':'BEGIN',
#     'end':'END',
#     'if':'IF',
#     'then':'THEN',
#     'while':'WHILE',
#     'do':'DO',
#     'call':'CALL',
#     'const':'CONST',
#     'int':'VAR',
#     'procedure':'PROCEDURE',
#     'out':'OUT',
#     'in':'IN',
#     'else':'ELSE'
# }
#tokens = tokens+list(reservadas.values())

t_ignore = '\t '

t_ignore_PLUS = r'\+'
t_ignore_MINUS = r'\-'
t_ignore_TIMES = r'\*'
t_ignore_DIVIDE = r'/'
t_ignore_ODD = r'ODD'
t_ignore_ASSIGN = r'='
t_ignore_NE = r'<>'
t_ignore_LT = r'<'
t_ignore_LTE = r'<='
t_ignore_GT = r'>'
t_ignore_GTE = r'>='
t_ignore_LPARENT = r'\('
t_ignore_RPARENT = r'\)'
t_ignore_COMMA = r','
t_ignore_SEMMICOLOM = r';'
t_ignore_DOT = r'\.'
t_ignore_UPDATE = r':='

def t_ID(t):
    r'[a-zA-Z_][a-zA-Z_0-9]*'
    t.type = reservadas.get(t.value, 'ID')  # Check for reserved words
    return t

def t_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)

def t_COMMENT(t):
    r'\//.*'
    r'\/*.*'
    r'\*/.*'
    pass

def t_NUMBER(t):
    r'\d+'
    t.value = int(t.value)
    pass

def t_error(t):
    print("----- '%s'" % t.value[0])
    t.lexer.skip(1)

analizador = lex.lex()
analizador.input(open('codigo.txt').read())  # 'codigo.txt' is a placeholder for the analyzed file
while True:
    tok = analizador.token()
    if not tok:
        break
    print(tok)
The output I get with the above code is:
LexToken(ID,'FSR',1,3)
LexToken(ID,'testing',1,7)
LexToken(ID,'sketch',1,15)
'---- '
'---- '
LexToken(ID,'Connect',3,28)
LexToken(ID,'one',3,36)
LexToken(ID,'end',3,40)
LexToken(ID,'of',3,44)
LexToken(ID,'FSR',3,47)
LexToken(ID,'to',3,51)
LexToken(ID,'V',3,55)
LexToken(ID,'the',3,58)
LexToken(ID,'other',3,62)
LexToken(ID,'end',3,68)
LexToken(ID,'to',3,72)
LexToken(ID,'Analog',3,75)
'---- '
.
.
.
.
LexToken(ID,'Serial',21,694)
LexToken(ID,'print',21,701)
----- '"'
LexToken(ID,'Analog',21,708)
LexToken(ID,'reading',21,715)
----- '"'
'---- '
LexToken(ID,'Serial',22,732)
LexToken(ID,'println',22,739)
LexToken(ID,'fsrReading',22,747)
'---- '
'---- '
LexToken(ID,'LEDbrightness',26,898)
LexToken(ID,'map',26,914)
LexToken(ID,'fsrReading',26,918)
'---- '
LexToken(ID,'analogWrite',28,996)
LexToken(ID,'LEDpin',28,1008)
LexToken(ID,'LEDbrightness',28,1016)
'---- '
LexToken(ID,'IF',29,1034)
'---- '
LexToken(if,'if',30,1038)
'---- '
LexToken(ID,'delay',31,1044)
'---- '
----- '}'
Press any key to continue . . .
My expected output would be this:
LexToken(ID,'IF',29,1034)
'---- '
LexToken(if,'if',30,1038)
I am analyzing Arduino code, and all those words are comments. I only need it to look for the conditionals if or IF, or other reserved words like for; the main idea is that, given a list of reserved words, the lexer identifies them and shows me only those selected.
If you want to discard tokens that are not in your 'reserved' list, adjust the t_ID function like so:
def t_ID(t):
    r'[a-zA-Z_][a-zA-Z_0-9]*'
    reserved_type = reservadas.get(t.value, False)
    if reserved_type:
        t.type = reserved_type
        return t  # Return token with reserved type
    return None  # Discard non-reserved tokens
Additionally, your comment token function is probably misapplied here.
def t_COMMENT(t):
    r'\//.*'
    r'\/*.*'
    r'\*/.*'
    pass
You can't use multiple rules or span a rule over multiple strings like this, because the docstring (which PLY uses to get the regex) will only contain the very first string.
Secondly, I think the regex needs adjusting for comments, assuming you're tokenizing C or a C-like language. Particularly, it needs to account for the possibility that comments span multiple lines.
To fix, apply the following for dealing with comments:
def t_block_comment(tok):
    r'/\*((.|\n))*?\*/'
    tok.lexer.lineno += tok.value.count('\n')
    return None  # Discard block comments "/* comment */"

t_ignore_comment = r'//.*'  # ignore inline comments "// comment"
You may also need to apply the regex multiline flag:
analizador = lex.lex(reflags=re.MULTILINE)
Lastly, your t_ignore_DIVIDE = r'/' may be preventing your comment rules from applying, too. Consider ordering this after the comment rules.
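Putting these suggestions together, a minimal runnable sketch of a lexer that reports only reserved words and silently drops everything else, comments included, could look like the following. The reservadas entries and the sample input string are made-up illustrations, not the asker's actual file:

import ply.lex as lex

reservadas = {
    'if': 'IF',
    'for': 'FOR',
    'while': 'WHILE',
}

tokens = ['ID'] + list(reservadas.values())

t_ignore = ' \t'

def t_block_comment(t):
    r'/\*(.|\n)*?\*/'
    t.lexer.lineno += t.value.count('\n')  # keep line numbers accurate; emit nothing

t_ignore_comment = r'//.*'  # single-line comments are dropped entirely

def t_ID(t):
    r'[a-zA-Z_][a-zA-Z_0-9]*'
    reserved_type = reservadas.get(t.value)
    if reserved_type:
        t.type = reserved_type
        return t
    # Non-reserved identifiers fall through and are discarded

def t_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)

def t_error(t):
    t.lexer.skip(1)  # silently skip operators, digits, punctuation, etc.

analizador = lex.lex()
analizador.input('/* setup */\nif (x) { delay(10); }\nwhile (1) {}\n')
while True:
    tok = analizador.token()
    if not tok:
        break
    print(tok)

This prints only the IF and WHILE tokens; identifiers, numbers, operators and comments are all discarded.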

Why is this code not working? Python

The code works fine when the text in the clipboard has no email address or phone number, i.e., when the expected result is "Nothing Found".
For other cases, it is not working. It shows this error:
AttributeError: 'str' object has no attribute 'matches'
#! python3
# contactDetails.py - Finds email and phone number from a page
import pyperclip, re

phoneRegex = re.compile(r'(\+\d{2}-\d{10})')  # Phone Number Regex

# email Regex
emailRegex = re.compile(r'''(
    [a-zA-Z0-9._]+       # username
    @                    # @ symbol
    [a-zA-Z0-9._]+       # domain name
    (\.[a-zA-Z]{2,4})    # dot-something
)''', re.VERBOSE)

text = str(pyperclip.paste())

matches = []
for groups in phoneRegex.findall(text):
    phoneNum = phoneRegex.findall(text)
    matches.append(phoneNum)
for groups in emailRegex.findall(text):
    matches.append(groups[0])

if len(matches) > 0:
    pyperclip.copy('\n'.matches)
    print('Copied to Clipboard:')
    print('\n'.join(matches))
else:
    print('Nothing Found')
As was mentioned in the comment by Wiktor Stribiżew, the problem is in this line:
pyperclip.copy('\n'.matches)
In particular, it is here:
'\n'.matches
The first item '\n' is a string object, and has no attribute called matches. What you want is to do a .join as you had done two lines later, i.e.
pyperclip.copy('\n'.join(matches))
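Beyond the join fix, the phone-number loop in the question has a second problem: it calls findall again on every iteration and appends the entire result list each time, so matches ends up holding lists rather than strings. Even with the join fix applied, '\n'.join(matches) would then raise a TypeError as soon as a phone number is found, because join only accepts strings. A minimal corrected tail of the script, reusing the phoneRegex, emailRegex and text defined above:

matches = []
for phoneNum in phoneRegex.findall(text):
    matches.append(phoneNum)        # with a single group, findall yields plain strings
for groups in emailRegex.findall(text):
    matches.append(groups[0])       # first element of each tuple is the full address (the outer group)
if len(matches) > 0:
    pyperclip.copy('\n'.join(matches))
    print('Copied to Clipboard:')
    print('\n'.join(matches))
else:
    print('Nothing Found')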

Using regular expressions to match a word in Python

I am using PRAW to make a reddit bot that takes the comment author of someone who says "alot" and stores their username in a list. I am having trouble with the regular expression and with getting the string matching to work. Here is my code.
# importing praw for reddit api and time to make intervals
import praw
import time
import re

username = "LewisTheRobot"
password =

r = praw.Reddit(user_agent="Counts people who say alot")
word_to_match = ['\balot\b']
storage = []

r.login(username, password)

def run_bot():
    subreddit = r.get_subreddit("test")
    print("Grabbing subreddit")
    comments = subreddit.get_comments(limit=200)
    print("Grabbing comments")
    for comment in comments:
        comment_text = comment.body.lower()
        isMatch = any(string in comment_text for string in word_to_match)
        if comment.id not in storage and isMatch:
            print("Match found! Storing username: " + str(comment.author) + " into list.")
            storage.append(comment.author)
            print("There are currently: " + str(len(storage)) + " people who use 'alot' instead of 'a lot'.")

while True:
    run_bot()
    time.sleep(5)
So the regular expression I am using looks for the word alot on its own, rather than alot as part of a longer string, e.g. zealot. Whenever I run this, it will not find a comment that I have made. Any suggestions?
You're checking with string operations, not RE ones, in
isMatch = any(string in comment_text for string in word_to_match)
The first in here checks for a substring -- nothing to do with REs.
Change this to
isMatch = any(re.search(string, comment_text) for string in word_to_match)
Moreover, you have an error in your initialization:
word_to_match = ['\balot\b']
'\b' is the character with code 0x08 (backspace). Always use raw string syntax for RE patterns, to avoid such traps:
word_to_match = [r'\balot\b']
Now you'll have a couple of characters, backslash then b, which RE will interpret to mean "word boundary".
There may be other bugs but I try not to look for more than two bugs per question...:-)
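A quick interactive check illustrates both fixes; the sample sentences are made up:

>>> import re
>>> bool(re.search(r'\balot\b', 'i do this alot'))
True
>>> bool(re.search(r'\balot\b', 'he is a zealot'))
False
>>> '\balot\b' in 'i do this alot'    # substring test with literal backspace characters
False

The first two calls show the word boundaries rejecting zealot, and the last line shows why the original test could never match: with the non-raw string, Python had already turned each \b into the backspace character 0x08.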

Python RegEx with word boundaries

I am trying to write a login routine for a python script. In doing so, I find the need to pattern match the credentials on a whole-word basis. I have attempted to RegEx this, but it is failing for reasons that are unclear to me but that I hope are obvious to someone here. The code and output:
import re
authentry = "testusertestpass"
username = "testuser"
password = "testpass"
combo = "r\'\\b"+username + password + "\\b\'"
testcred = re.search(combo, authentry)
print combo
print authentry
print testcred
r'\btestusertestpass\b'
testusertestpass
None
So my regex test appears, at least to me, to be properly formatted, and should be a direct match against the test string, but is not. Any ideas? Thanks so much for any insight!
Try this; it may work:
import re
authentry = "testusertestpass with another text"
username = "testuser"
password = "testpass"
combo = username + password + r'\b'
testcred = re.search(combo, authentry)
print combo
print authentry
print testcred
output:
testusertestpass\b
testusertestpass with another text
<_sre.SRE_Match object at 0x1b8a030>
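For reference, the root cause in the question is that the r prefix and the quotes were embedded into the string value itself, so the pattern literally began with the characters r and ' instead of a word boundary. Building the pattern with raw string literals behaves as expected; re.escape is optional here but guards against regex metacharacters in real credentials:

>>> import re
>>> username = "testuser"
>>> password = "testpass"
>>> combo = r'\b' + re.escape(username + password) + r'\b'
>>> combo
'\\btestusertestpass\\b'
>>> re.search(combo, "testusertestpass") is not None
True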

Lexer for Parsing to the end of a line

Once the lexer encounters a given keyword, how can I get it to just grab the rest of the line and return it as a string? That is, when it reaches the end of the line, it should return everything on that line.
Here is the line I'm looking at:
description here is the rest of my text to collect
Thus, when the lexer encounters description, I would like "here is the rest of my text to collect" returned as a string
I have the following defined, but it seems to be throwing an error:
states = (
    ('bcdescription', 'exclusive'),
)

def t_bcdescription(t):
    r'description '
    t.lexer.code_start = t.lexer.lexpos
    t.lexer.level = 1
    t.lexer.begin('bcdescription')

def t_bcdescription_close(t):
    r'\n'
    t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos+1]
    t.type = "BCDESCRIPTION"
    t.lexer.lineno += t.value.count('\n')
    t.lexer.begin('INITIAL')
    return t
This is part of the error being returned:
File "/Users/me/Coding/wm/wm_parser/ply/lex.py", line 393, in token
raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])
ply.lex.LexError: Illegal character ' ' at index 40
Finally, if I wanted this functionality for more than one token, how could I accomplish that?
Thanks for your time
There is no big problem with your code; in fact, I just copied your code and ran it, and it works well:
import ply.lex as lex

states = (
    ('bcdescription', 'exclusive'),
)

tokens = ("BCDESCRIPTION",)

def t_bcdescription(t):
    r'\bdescription\b'
    t.lexer.code_start = t.lexer.lexpos
    t.lexer.level = 1
    t.lexer.begin('bcdescription')

def t_bcdescription_close(t):
    r'\n'
    t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos+1]
    t.type = "BCDESCRIPTION"
    t.lexer.lineno += t.value.count('\n')
    t.lexer.begin('INITIAL')
    return t

def t_bcdescription_content(t):
    r'[^\n]+'

lexer = lex.lex()

data = 'description here is the rest of my text to collect\n'
lexer.input(data)

while True:
    tok = lexer.token()
    if not tok:
        break
    print tok
and the result is:
LexToken(BCDESCRIPTION,' here is the rest of my text to collect\n',1,50)
So maybe you can check other parts of your code.
And if you want this functionality for more than one token, you can simply capture words, and when a word appears that is in your list of trigger keywords, start capturing the rest of the content with the code above; a sketch of this follows.
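Here is that sketch. The title keyword, the token names, and the sample input are invented for illustration: a single trigger rule matches any of the keywords, records which one fired, and one shared exclusive state captures the rest of the line.

import ply.lex as lex

states = (('capture', 'exclusive'),)

# One token type per trigger keyword
tokens = ('DESCRIPTION', 'TITLE')

def t_capture(t):
    r'\b(description|title)\b'
    t.lexer.capture_type = t.value.upper()  # remember which keyword fired
    t.lexer.code_start = t.lexer.lexpos
    t.lexer.begin('capture')

def t_capture_close(t):
    r'\n'
    # Everything after the keyword, up to but not including the newline
    t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos - 1].strip()
    t.type = t.lexer.capture_type
    t.lexer.lineno += 1
    t.lexer.begin('INITIAL')
    return t

def t_capture_content(t):
    r'[^\n]+'

t_ignore = ' \t'
t_capture_ignore = ''

def t_error(t):
    t.lexer.skip(1)

def t_capture_error(t):
    t.lexer.skip(1)

lexer = lex.lex()
lexer.input('description here is the rest of my text to collect\ntitle a second captured line\n')
while True:
    tok = lexer.token()
    if not tok:
        break
    print(tok)

This yields one DESCRIPTION token and one TITLE token, each carrying the captured text of its line as the token value.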
It is not obvious why you need to use a lexer/parser for this without further information.
>>> x = 'description here is the rest of my text to collect'
>>> a, b = x.split(' ', 1)
>>> a
'description'
>>> b
'here is the rest of my text to collect'
