Python Identifier Identification - python

I'm reading a Python file in a Python program and I want to get the list of all identifiers, literals, separators and terminators in the Python file being read. Using identifiers as an example:
one_var = "something"
two_var = "something else"
other_var = "something different"
Assuming the variables above are in the file being read, the result should be:
list_of_identifiers = [one_var, two_var, other_var]
Same thing goes for literals, terminators and separators. Thanks
I already wrote code for all operators and keywords:
import keyword, operator

def scan_tokens(path):
    """Scan the Python source file at *path* for keywords and operator words.

    Returns (list_of_keywords, list_of_operators), each in order of
    appearance with duplicates kept — the same accumulation the original
    script performed.

    Fix: the original split each line with split(' '), which left the
    trailing "\\n" glued to the last word, so a keyword at the end of a line
    was never matched. split() with no argument strips all whitespace,
    including the newline.

    NOTE(review): whitespace splitting is a rough tokenizer — operators glued
    to their operands (e.g. "a+b") are missed, and operator.__all__ contains
    function *names* such as "add", so those plain words also count as
    operators. Kept to preserve the original matching rules.
    """
    more_operators = ['+', '-', '/', '*', '%', '**', '//', '==', '!=', '>',
                      '<', '>=', '<=', '=', '+=', '-=', '*=', '/=', '%=',
                      '**=', '//=', '&', '|', '^', '~', '<<', '>>',
                      'in', 'not in', 'is', 'is not', 'not', 'or', 'and']
    list_of_keywords = []
    list_of_operators = []
    with open(path) as data_source:
        for each_line in data_source:
            for each_word in each_line.split():
                if each_word in keyword.kwlist:
                    list_of_keywords.append(each_word)
                elif each_word in operator.__all__ or each_word in more_operators:
                    list_of_operators.append(each_word)
    return list_of_keywords, list_of_operators

if __name__ == '__main__':
    kws, ops = scan_tokens('file.py')
    print("Operators found:\n", ops)
    print("Keywords found:\n", kws)

import ast

def collect_identifiers(source):
    """Return the set of identifier names (ast.Name nodes) in *source*.

    Generalized from the original, which hard-coded reading 'file.py':
    any source text can now be analyzed, and the original script behavior
    is preserved under the main guard below.
    """
    tree = ast.parse(source)
    return {node.id for node in ast.walk(tree) if isinstance(node, ast.Name)}

if __name__ == '__main__':
    with open('file.py') as data_source:
        print(collect_identifiers(data_source.read()))

Related

Select all string in list and append a string - python

I want to add "." to the end of each item of the doing variable.
and I want output like this => I am watching.
import random

main = ['I', 'You', 'He', 'She']
main0 = ['am', 'are', 'is', 'is']
doing = ['playing', 'watching', 'reading', 'listening']

# One random subject and one random activity, drawn exactly as before.
rd = random.choice(main)
rd0 = random.choice(doing)

result = []
# The original if-ladder that paired each subject in `main` with its verb in
# `main0` was disabled (quoted out with triple quotes); preserved here as
# comments, so behavior is unchanged:
# if rd == main[0]: result.append(rd); result.append(main0[0])
# if rd == main[1]: result.append(rd); result.append(main0[1])
# if rd == main[2]: result.append(rd); result.append(main0[2])
# if rd == main[3]: result.append(rd); result.append(main0[3])
result.append(rd0)
print(result)
well, I tried those codes.
'.'.append(doing)
'.'.append(doing[0])
'.'.append(rd0)
but none of them works; each only returns an error:
Traceback (most recent call last):
File "c:\Users\----\Documents\Codes\Testing\s.py", line 21, in <module>
'.'.append(rd0)
AttributeError: 'str' object has no attribute 'append'
Why not just select a random string from each list and then build the output as you want:
import random

main = ['I', 'You', 'He', 'She']
verb = ['am', 'are', 'is', 'is']
doing = ['playing', 'watching', 'reading', 'listening']

# Draw one word from each list.
p1 = random.choice(main)
p2 = random.choice(verb)
p3 = random.choice(doing)

# f-string instead of ' '.join(...) + '.': produces the identical text.
output = f"{p1} {p2} {p3}."
print(output)  # e.g. You is playing.
Bad English, but the logic seems to be what you want here.
Just add a period after making the string:
import random

main = ['I', 'You', 'He', 'She']
main0 = ['am', 'are', 'is', 'is']
doing = ['playing', 'watching', 'reading', 'listening']
# Pick one entry from each list, space-join them, and append a period.
' '.join([random.choice(words) for words in (main, main0, doing)]) + '.'

tokenizing: how to not tokenize punctuation like `^* in python for NLP

I want to tokenize string punctuation except `*^
I've tried, but in the result all types of punctuation are separated, including the punctuation I don't want separated
when i use:
text = "hai*ini^ema`il saya lunar!?"
tokenizer = TweetTokenizer()
nltk_tokens = tokenizer.tokenize(text)
nltk_tokens
i get:
['hai', '*', 'ini', '^', 'ema', '`', 'il', 'saya', 'lunar', '!', '?']
what i want is:
['hai*ini^ema`il', 'saya', 'lunar', '!', '?']
I want to tokenize but not tokenize *^`
Try this:
import re

def phrasalize(tokens):
    """Join *tokens* with spaces, gluing *, ^ and ` to their word neighbors.

    e.g. ['hai', '*', 'ini'] -> 'hai*ini'; other tokens keep their spaces.
    Returns the joined string (not a token list), as the original did.

    Fix: the original called re.match in a replace loop; re.match anchors at
    the *start* of the string, so a phrase appearing after the first token
    was never collapsed. A single re.sub with zero-width lookarounds handles
    every occurrence, including consecutive ones such as "a * b ^ c".
    """
    s = " ".join(tokens)
    # (?<=\w) / (?=\w) keep the original's word-character context requirement
    # without consuming those characters, so adjacent phrases all match.
    return re.sub(r"(?<=\w)\s([*^`])\s(?=\w)", r"\1", s)

tokens = ['hai', '*', 'ini', '^', 'ema', '`', 'il', 'saya', 'lunar', '!', '?']
phrasalize(tokens)
[out]:
'hai*ini^ema`il saya lunar ! ?'

AttributeError: 'list' object has no attribute 'isdigit'. Specifying POS of each and every word in sentences list efficiently?

Suppose I am having lists of list of sentences (in a large corpus) as collections of tokenized words. The sample format is as follows:
The format of tokenized_raw_data is as follows:
[['arxiv', ':', 'astro-ph/9505066', '.'], ['seds', 'page', 'on', '``',
'globular', 'star', 'clusters', "''", 'douglas', 'scott', '``', 'independent',
'age', 'estimates', "''", 'krysstal', '``', 'the', 'scale', 'of', 'the',
'universe', "''", 'space', 'and', 'time', 'scaled', 'for', 'the', 'beginner',
'.'], ['icosmos', ':', 'cosmology', 'calculator', '(', 'with', 'graph',
'generation', ')', 'the', 'expanding', 'universe', '(', 'american',
'institute', 'of', 'physics', ')']]
I want to apply the pos_tag.
What I have tried up to now is as follows.
import os, nltk, re
from nltk.corpus import stopwords
from unidecode import unidecode
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
def read_data():
# Reads the whole corpus file and stores a newline-joined, line-tokenized
# version of it in the module-level global `tokenized_raw_data`.
global tokenized_raw_data
with open("path//merge_text_results_pu.txt", 'r', encoding='utf-8', errors = 'replace') as f:
raw_data = f.read()
# NOTE(review): confirm `nltk.line_tokenize` exists in the installed NLTK
# version; splitting into lines and re-joining with '\n' effectively
# normalizes line endings.
tokenized_raw_data = '\n'.join(nltk.line_tokenize(raw_data))
# Executed at import time: populates tokenized_raw_data as a side effect.
read_data()
def function1():
# Sentence-tokenize the lowercased corpus text (one big string).
tokens_sentences = sent_tokenize(tokenized_raw_data.lower())
# Word-tokenize each sentence -> list of lists of tokens. The loop variable
# `word` is reused for both the sentence and the token, which obscures intent.
unfiltered_tokens = [[word for word in word_tokenize(word)] for word in tokens_sentences]
# NOTE(review): nltk.pos_tag expects a FLAT list of token strings; passing a
# list of lists is what raises "'list' object has no attribute 'isdigit'".
# Tag each inner list instead, e.g. [nltk.pos_tag(s) for s in unfiltered_tokens].
tagged_tokens = nltk.pos_tag(unfiltered_tokens)
# Keep only nouns (common/proper, singular/plural), encoded to UTF-8 bytes.
nouns = [word.encode('utf-8') for word,pos in tagged_tokens
if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
# Decode the byte strings back and join into one space-separated string.
joined_nouns_text = (' '.join(map(bytes.decode, nouns))).strip()
# NOTE(review): wordpunct_tokenize is not in this file's visible imports.
noun_tokens = [t for t in wordpunct_tokenize(joined_nouns_text)]
# NOTE(review): stop_words is built but never used before the function ends.
stop_words = set(stopwords.words("english"))
function1()
I am getting the following error.
> AttributeError: 'list' object has no attribute 'isdigit'
Please help how to overcome this error in time-efficient manner? Where I am going wrong?
Note: I am using Python 3.7 on Windows 10.
Try this-
# One empty bucket per sentence in unfiltered_tokens.
word_list = [[] for _ in unfiltered_tokens]
# Keep word[1:] (the token minus its first character) whenever the remainder
# is purely alphabetic — the same filter as the original pair of index loops.
for position, sentence in enumerate(unfiltered_tokens):
    for word in sentence:
        trimmed = word[1:]
        if trimmed.isalpha():
            word_list[position].append(trimmed)
then after do
# Part-of-speech tag each sentence's surviving words — comprehension form of
# the original append loop.
tagged_tokens = [nltk.pos_tag(sentence_words) for sentence_words in word_list]
You will get your desired results! Hope this helped.

how to select specific words and put them into tuple - list?

I got a result of a long string through using BeautifulSoup.
It is shaped something like this:
<span>title1</span>
<span>title2</span>
<span>title3</span>
<span>title4</span>
I want to specifically select "link#" and "title" and put them in a list - tuple like the one below:
[(link1,title1),(link2,title2),(link3,title3),(link4,title4)]
Due to my lack of understanding of Python,
I don't even know what to search for.
I've been trying to do this for like 6 hours and still couldn't find the way.
the bs code i used
def extract(self):
    """Download the page and print every <a> tag inside li.has-sub elements.

    Fix: the original iterated with `for self.div in ...` and
    `for self.li in ...`, which rebinds instance attributes on every
    iteration and leaks the last loop value onto the object; plain local
    variables avoid that side effect. The url/source/text/soup attributes
    are still stored on self, as before.
    """
    self.url = "http://aetoys.tumblr.com"
    self.source = requests.get(self.url)
    self.text = self.source.text
    # NOTE(review): no explicit parser is passed to BeautifulSoup; the
    # default parser can differ between installs — confirm which is intended.
    self.soup = BeautifulSoup(self.text)
    for div in self.soup.findAll('li', {'class': 'has-sub'}):
        for anchor in div.find_all('a'):
            print(anchor)
You just need to extract the href:
# Collect, per <li class="has-sub"> element, the href of every anchor in it.
out = [] # store lists of lists
# NOTE(review): `for self.div in ...` rebinds an instance attribute on each
# iteration (mirroring the asker's style); a plain local would be cleaner.
for self.div in self.soup.findAll('li',{'class':'has-sub'}):
out.append([x["href"] for x in self.div.find_all('a',href=True)])
print([x["href"] for x in self.div.find_all('a',href=True)])
['#', '#', '/onepiece_book', '/onepiece', '#', '/naruto_book', '/naruto', '#', '/bleach_book', '/bleach', '/kingdom', '/tera', '/torico', '/titan', '/seven', '/fairytail', '/soma', '/amsal', '/berserk', '/ghoul', '/kaizi', '/piando']
['#', '/onepiece_book', '/onepiece']
['#', '/naruto_book', '/naruto']
['#', '/bleach_book', '/bleach']
['#', '/conan', '/silver', '/hai', '/nise', '/hunterbyhunter', '/baku', '/unhon', '/souleater', '/liargame', '/kenichi', '/dglayman', '/magi', '/suicide', '/pedal']
['#', '/dobaku', '/gisei', '/dragonball', '/hagaren', '/gantz', '/doctor', '/dunk', '/susi', '/reborn', '/airgear', '/island', '/crows', '/beelzebub', '/zzang', '/akira', '/tennis', '/kuroco', '/claymore', '/deathnote']
To get a single list:
url = "http://aetoys.tumblr.com"
source = requests.get(url)
text = source.text
soup = BeautifulSoup(text)
# One flattened list of every href under each li.has-sub element.
# Fix: the original used the Python 2 print *statement* (`print [...]`),
# which is a SyntaxError on Python 3 and inconsistent with the rest of the
# file; print() is used here instead.
print([x["href"] for div in soup.findAll('li', {'class': 'has-sub'}) for x in div.find_all('a', href=True)])
['#', '#', '/onepiece_book', '/onepiece', '#', '/naruto_book', '/naruto', '#', '/bleach_book', '/bleach', '/kingdom', '/tera', '/torico', '/titan', '/seven', '/fairytail', '/soma', '/amsal', '/berserk', '/ghoul', '/kaizi', '/piando', '#', '/onepiece_book', '/onepiece', '#', '/naruto_book', '/naruto', '#', '/bleach_book', '/bleach', '#', '/conan', '/silver', '/hai', '/nise', '/hunterbyhunter', '/baku', '/unhon', '/souleater', '/liargame', '/kenichi', '/dglayman', '/magi', '/suicide', '/pedal', '#', '/dobaku', '/gisei', '/dragonball', '/hagaren', '/gantz', '/doctor', '/dunk', '/susi', '/reborn', '/airgear', '/island', '/crows', '/beelzebub', '/zzang', '/akira', '/tennis', '/kuroco', '/claymore', '/deathnote']
If you really want tuples:
# One tuple of hrefs per li.has-sub element — comprehension form of the
# original append loop; same list of tuples in the same order.
out = [
    tuple(x["href"] for x in div.find_all('a', href=True))
    for div in soup.findAll('li', {'class': 'has-sub'})
]

Adding Syntax Highlighting to a Text Editor with Python

I am creating my own web-based text editor and I want to add in syntax highlighting. To start off I will on do highlighting for one language (Python, most likely), but later on I want to add highlighting for every language I can think of.
I would like to find a tutorial on this if possible; does anyone know of a place where I can find one?
Also, if there are any other tips you can give me, that would be great.
Take a look at Pygments.
highlight.js
# syntax.py
import sys
from PyQt4.QtCore import QRegExp
from PyQt4.QtGui import QColor, QTextCharFormat, QFont, QSyntaxHighlighter
def format(color, style=''):
    """Build a QTextCharFormat for *color*, optionally 'bold' and/or 'italic'.

    NOTE(review): the name shadows the builtin format(); it is kept because
    the STYLES table in this file calls it under this name.
    """
    char_format = QTextCharFormat()
    qt_color = QColor()
    qt_color.setNamedColor(color)
    char_format.setForeground(qt_color)
    if 'bold' in style:
        char_format.setFontWeight(QFont.Bold)
    if 'italic' in style:
        char_format.setFontItalic(True)
    return char_format
# Syntax styles that can be shared by all languages
# Lookup table of display formats, keyed by token category; each entry is a
# QTextCharFormat produced by the format() helper above.
STYLES = {
'keyword': format('blue'),
'operator': format('red'),
'brace': format('darkGray'),
'defclass': format('black', 'bold'),
'string': format('magenta'),
# 'string2' is used for triple-quoted (multi-line) strings.
'string2': format('darkMagenta'),
'comment': format('darkGreen', 'italic'),
'self': format('black', 'italic'),
'numbers': format('brown'),
}
class PythonHighlighter (QSyntaxHighlighter):
"""Syntax highlighter for the Python language.
"""
# Python keywords
# NOTE(review): includes Python 2-only keywords ('exec', 'print') and the
# constants None/True/False, which are keywords in Python 3.
keywords = [
'and', 'assert', 'break', 'class', 'continue', 'def',
'del', 'elif', 'else', 'except', 'exec', 'finally',
'for', 'from', 'global', 'if', 'import', 'in',
'is', 'lambda', 'not', 'or', 'pass', 'print',
'raise', 'return', 'try', 'while', 'yield',
'None', 'True', 'False',
]
# Python operators
# Each entry is a regex fragment, hence the backslash-escaping of regex
# metacharacters such as +, *, ^, | and ~.
operators = [
'=',
# Comparison
'==', '!=', '<', '<=', '>', '>=',
# Arithmetic
'\+', '-', '\*', '/', '//', '\%', '\*\*',
# In-place
'\+=', '-=', '\*=', '/=', '\%=',
# Bitwise
'\^', '\|', '\&', '\~', '>>', '<<',
]
# Python braces
braces = [
'\{', '\}', '\(', '\)', '\[', '\]',
]
def __init__(self, document):
# document: the QTextDocument this highlighter attaches itself to.
QSyntaxHighlighter.__init__(self, document)
# Multi-line strings (expression, flag, style)
# FIXME: The triple-quotes in these two lines will mess up the
# syntax highlighting from this point onward
# The integers 1 and 2 are block-state markers used by match_multiline to
# remember whether a line ended inside a '''...''' or """...""" string.
self.tri_single = (QRegExp("'''"), 1, STYLES['string2'])
self.tri_double = (QRegExp('"""'), 2, STYLES['string2'])
rules = []
# Keyword, operator, and brace rules
rules += [(r'\b%s\b' % w, 0, STYLES['keyword'])
for w in PythonHighlighter.keywords]
rules += [(r'%s' % o, 0, STYLES['operator'])
for o in PythonHighlighter.operators]
rules += [(r'%s' % b, 0, STYLES['brace'])
for b in PythonHighlighter.braces]
# All other rules
rules += [
# 'self'
(r'\bself\b', 0, STYLES['self']),
# Double-quoted string, possibly containing escape sequences
(r'"[^"\\]*(\\.[^"\\]*)*"', 0, STYLES['string']),
# Single-quoted string, possibly containing escape sequences
(r"'[^'\\]*(\\.[^'\\]*)*'", 0, STYLES['string']),
# 'def' followed by an identifier
(r'\bdef\b\s*(\w+)', 1, STYLES['defclass']),
# 'class' followed by an identifier
(r'\bclass\b\s*(\w+)', 1, STYLES['defclass']),
# From '#' until a newline
(r'#[^\n]*', 0, STYLES['comment']),
# Numeric literals
(r'\b[+-]?[0-9]+[lL]?\b', 0, STYLES['numbers']),
(r'\b[+-]?0[xX][0-9A-Fa-f]+[lL]?\b', 0, STYLES['numbers']),
(r'\b[+-]?[0-9]+(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?\b', 0, STYLES['numbers']),
]
# Build a QRegExp for each pattern
# Each rule is (compiled pattern, capture-group index to format, format).
self.rules = [(QRegExp(pat), index, fmt)
for (pat, index, fmt) in rules]
def highlightBlock(self, text):
"""Apply syntax highlighting to the given block of text.
"""
# Do other syntax formatting
# NOTE(review): the loop variable `format` shadows the module-level
# format() helper within this method body.
for expression, nth, format in self.rules:
index = expression.indexIn(text, 0)
while index >= 0:
# We actually want the index of the nth match
index = expression.pos(nth)
# NOTE(review): .cap(nth).length() assumes cap() returns a QString
# (PyQt4 API v1); with a Python str this would need len() instead —
# confirm against the PyQt build in use.
length = expression.cap(nth).length()
self.setFormat(index, length, format)
index = expression.indexIn(text, index + length)
self.setCurrentBlockState(0)
# Do multi-line strings
in_multiline = self.match_multiline(text, *self.tri_single)
if not in_multiline:
in_multiline = self.match_multiline(text, *self.tri_double)
def match_multiline(self, text, delimiter, in_state, style):
"""Do highlighting of multi-line strings. ``delimiter`` should be a
``QRegExp`` for triple-single-quotes or triple-double-quotes, and
``in_state`` should be a unique integer to represent the corresponding
state changes when inside those strings. Returns True if we're still
inside a multi-line string when this function is finished.
"""
# If inside triple-single quotes, start at 0
if self.previousBlockState() == in_state:
start = 0
add = 0
# Otherwise, look for the delimiter on this line
else:
start = delimiter.indexIn(text)
# Move past this match
add = delimiter.matchedLength()
# As long as there's a delimiter match on this line...
while start >= 0:
# Look for the ending delimiter
end = delimiter.indexIn(text, start + add)
# Ending delimiter on this line?
if end >= add:
length = end - start + add + delimiter.matchedLength()
self.setCurrentBlockState(0)
# No; multi-line string
else:
self.setCurrentBlockState(in_state)
# NOTE(review): text.length() also assumes a QString argument; with a
# Python str this raises AttributeError — should be len(text) here.
length = text.length() - start + add
# Apply formatting
self.setFormat(start, length, style)
# Look for the next match
start = delimiter.indexIn(text, start + length)
# Return True if still inside a multi-line string, False otherwise
if self.currentBlockState() == in_state:
return True
else:
return False
from PyQt4 import QtGui
import syntax

# Minimal demo: a plain-text editor whose document is highlighted by
# syntax.PythonHighlighter (the class defined in syntax.py above).
app = QtGui.QApplication([])
texter = QtGui.QPlainTextEdit()
highlight = syntax.PythonHighlighter(texter.document())
texter.show()
# Fix: the original opened the file without ever closing it (leaked handle);
# the with-block guarantees the file is closed after reading.
with open('syntax.py', 'r') as infile:
    texter.setPlainText(infile.read())
app.exec_()
# use the program syntax.py to make it work I posted it

Categories

Resources