ImportError: No module named 'xxxx' - Python

Basically what I'm trying to do is make a program in Python which takes a URL, copies the source, pulls all the comments out, and presents them to the user.
import urllib2
import html2text
import PullsCommentsOut.pullscommentsout
url = raw_input('Please input URL with the text you want to analyze: ')
page = urllib2.urlopen(url)
html_content = page.read().decode('utf8')
rendered_content = html2text.html2text(html_content).encode('ascii', 'ignore')
f = open('file_text.txt', 'wb')
f.write(rendered_content)
f.close()
result = PullsCommentsOut.pullscommentsout(html_content)
print result
And my second file, 'PullsCommentsOut':
import re

def pullscommentsout():
    def comment_remover(text):
        def replacer(match):
            s = match.group(0)
            if s.startswith('/'):
                print s
                return " " # note: a space and not an empty string
            else:
                return s
        pattern = re.compile(
            r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
            re.DOTALL | re.MULTILINE
        )
        return re.sub(pattern, replacer, text)
    fd = open("test.c", "r")
    buf = fd.read()
    comment_remover(buf)
For the life of me I can't figure out why Python thinks I'm not importing the proper module. It doesn't make sense.
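A likely cause, assuming PullsCommentsOut.py is a plain module sitting next to the main script: import PullsCommentsOut.pullscommentsout treats pullscommentsout as a submodule of a package, but it is a function inside the module, so the import fails. A minimal sketch of the corrected usage (note that pullscommentsout() is also defined with no parameters, so it would need a text argument for this call to work):

import PullsCommentsOut # the module, i.e. the file PullsCommentsOut.py

result = PullsCommentsOut.pullscommentsout(html_content)
print result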

Related

Unique strings before writing to a file - Python

Why doesn't this work? I want to de-duplicate the results I get from the REST API before I write them to the file --
MISP_HOST="https://192.168.1.8"
API_KEY="asdfasdfas"
EXPORT_DATA="attributes/text/download/md5"
OUTPUT_FILE="md5-"+today

def main():
    URL="%s/%s" % (MISP_HOST, EXPORT_DATA)
    request = urllib2.Request(URL)
    f = open(OUTPUT_FILE,'w')
    request.add_header('Authorization', API_KEY)
    data = urllib2.urlopen(request).read()
    set(data)
    print type(data)
    f.write(data)
    f.close()
It runs with no errors, but the data is definitely not unique. I'm trying not to do this in bash. Could you explain why it doesn't work, too? Many thanks!
If your result is plain text, you can use a regular expression to find all of the words in the text and then build a set from there. This example also lower-cases the words so that the set is case-insensitive, and writes each word on its own line.
import re
import urllib2 # needed for Request/urlopen below

MISP_HOST="https://192.168.1.8"
API_KEY="asdfasdfas"
EXPORT_DATA="attributes/text/download/md5"
OUTPUT_FILE="md5-"+today # 'today' is assumed to be defined elsewhere, as in the question

def main():
    URL="%s/%s" % (MISP_HOST, EXPORT_DATA)
    request = urllib2.Request(URL)
    f = open(OUTPUT_FILE,'w')
    request.add_header('Authorization', API_KEY)
    data = urllib2.urlopen(request).read()
    unique = set(word.lower() for word in re.findall(r'\w+', data))
    # that could be expanded to
    # wordlist = re.findall(r'\w+', data)
    # unique = set(word.lower() for word in wordlist)
    print type(unique)
    f.write('\n'.join(unique))
    f.close()
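As for why the original doesn't work: a Python string is a sequence of characters, so set(data) builds a set of the individual characters in the response, and since the result is never assigned, it is thrown away; f.write(data) then writes the raw, unmodified text. A quick interpreter demo:

>>> set("abcabc")
set(['a', 'c', 'b'])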

Error in with loop in Python

I have some Python code that preprocesses text and writes it to a file.
It removes hashtags, usernames, symbols, links, and stopwords, and also reduces each word to its root.
import tweepy
import time
import os
import sys
import json
import argparse
import re
from collections import defaultdict
import glob
from nltk.stem.snowball import SnowballStemmer

text = "shit.txt"

def process_text(text=text):
    text=re.sub('\\B#[a-zA-Z0-9_]*\\b','',text)
    text=re.sub('\\B@[a-zA-Z0-9_]*\\b','',text)
    text=re.sub('\\B$[a-zA-Z0-9_]*\\b','',text)
    text=re.sub('\\bRT\\b','',text)
    text = text.lower()
    text = re.sub("(https?://[^ ]+)",'',text)
    if text:
        a1 = [line.split("-")[0] for line in file("ListOfShortWords.txt")]
        a2 = [re.sub("\n",'',line.split("-")[1]).encode("utf-8") for line in file("ListOfShortWords.txt")]
        HashList = defaultdict(lambda:"nil")
        for c in range(0,len(a1)):
            HashList[a1[c]] = a2[c]
        text = re.sub(r'([aeiou])\1{2,}', r'\1', text)
        text = re.sub(r'([^aeiou])\1{2,}', r'\1\1',text)
        text = re.sub(r'(.)\1{2,}\\b', r'\1', text)
        for key in HashList.keys():
            text = re.sub("\\b"+str(key)+"\\b",str(HashList[key]),text)
        for stopword in ['about','above','after','ain\'t','aint','again','against','all','am','an','and','any','are','as','at','be','because','been','before','being','below','between','both','but','by','could','did','do','does','doing','down','during','each','few','for','from','further','had','has','have','having','he','he\'d','he\'ll','he\'s','here','here\'s','hers','herself','him','himself','her','his','how','how\'s','i','i\'d','i\'ll','i\'m','i\'ve','ive','if','in','into','is','it','it\'s','its','itself','let\'s','lets','me','more','most','my','myself','no','nor','not','of','off','on','once','only','or','other','ought','our','ours','ourselves','out','over','own','same','she','she\'d','she\'ll','she\'s','shes','should','so','some','such','than','that','that\'s','thats','the','their','theirs','them','themselves','then','there','there\'s','theres','these','they','they\'d','theyd','they\'ll','they\'re','they\'ve','theyll','theyre','theyve','this','those','through','to','too','under','until','up','very','was','we','we\'d','we\'ll','we\'re','we\'ve','were','what','what\'s','whats','when','when\'s','whens','where','where\'s','wheres','which','while','who','who\'s','whos','whom','why','why\'s','whys','with','won\'t','wont','would','you','you\'d','youd','you\'ll','youll','you\'re','you\'ve','youre','youve','your','yours','yourself','yourselves','\'tis','\'twas','tis','twas']:
            text = re.sub("\\b"+stopword+"\\b",'',text)
        for ch in ['&','$',',','.','/',':',';','"','{','[','}',']','|','\\','+','=','-','_',')','(','*','^','%','!','~','`','?']:
            text = text.replace(ch,' ')
        text = re.sub("\\b[0-9]*\\b",'',text)
        text = text.replace('\'','')
        text = re.sub('\\b[a-z]\\b','',text)
        text = re.sub(r'[^\x00-\x7F]+',' ',text)
        text = ' '.join(text.split())
        return text

for pp in ['pos','neg','neu','irr']:
    a = 1
    for fil in glob.glob("Senti/"+str(pp)+"/*.txt"):
        for line in file(fil):
            t = process_text(text=line)
            realline=''
            for word in t.split():
                realline = realline+" "+str(SnowballStemmer("english").stem(word)
            with open("Processed Senti/"+str(pp)+"/"+str(a)+".txt", 'w') as outf:
                outf.write(realline)
            a=a+1
I get an error saying
with open("Processed Senti/"+str(pp)+"/"+str(a)+".txt", 'w') as outf:
^
SyntaxError: invalid syntax
What is wrong with the code? All required folders and files exist.
There is a ) missing on the previous line: the str(...) call is not closed correctly.
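With the parenthesis restored, the line reads:

realline = realline+" "+str(SnowballStemmer("english").stem(word))

As a side note, building SnowballStemmer("english") once before the loops and reusing it would avoid constructing a new stemmer for every single word.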

Remove duplicates after altering items

I have a script to clean URLs down to their base domains: example.com/example1 and example.com/example2 both become example.com. My issue is that when it goes through the file of URLs it produces duplicate base domains. I want to remove the duplicates while printing the URLs to a file. Below is the code I currently have.
from Tkinter import *
import tkFileDialog
import re

def main():
    fileOpen = Tk()
    fileOpen.withdraw() # hiding tkinter window
    file_path = tkFileDialog.askopenfilename(
        title="Open file", filetypes=[("txt file",".txt")])
    if file_path != "":
        print "you chose file with path:", file_path
    else:
        print "you didn't open anything!"
    fin = open(file_path)
    fout = open("URL Cleaned.txt", "wt")
    for line in fin.readlines():
        editor = (line.replace('[.]', '.')
                  .replace('[dot]', '.')
                  .replace('hxxp://www.', '')
                  .replace('hxxps://www.', '')
                  .replace('hxxps://', '')
                  .replace('hxxp://', '')
                  .replace('www.', '')
                  .replace('http://www.', '')
                  .replace('https://www.', '')
                  .replace('https://', '')
                  .replace('http://', ''))
        editor = re.sub(r'/.*', '', editor)

if __name__ == '__main__':
    main()
Any help is appreciated. I have scoured the posts and tried all of the suggestions for my issue and have not found one that works.
You can use regular expressions to find the base domains.
If you have one url per line in your file:
import re

def main():
    file = open("url.txt",'r')
    domains = set()
    # will work for any URL like https://www.domain.com/something/somethingmore... ,
    # also without www, without https, or just www.domain.org
    matcher = re.compile("(h..ps?://)?(?P<domain>(www\.)?[^/]*)/?.*")
    for line in file:
        # make here any replace you need for obfuscated urls, like: line = line.replace('[.]','.')
        if line[-1] == '\n': # remove "\n" from end of line if present
            line = line[0:-1]
        match = matcher.search(line)
        if match != None: # if a url has been found
            domains.add(match.group('domain'))
    print domains
    file.close()

main()
For example, with this file, it will print:
set(['platinum-shakers.net', 'wmi.ns01.us', 'adservice.no-ip.org', 'samczeruno.pl', 'java.ns1.name', 'microsoft.dhcp.biz', 'ids.us01.us', 'devsite.quostar.com', 'orlandmart.com'])
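If you want the unique domains written to a file rather than printed, a minimal addition at the end of main(), reusing the output file name from the question:

fout = open("URL Cleaned.txt", "wt")
fout.write('\n'.join(sorted(domains)))
fout.close()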
Perhaps you could use a regular expression:
import re

p = re.compile(r".*\.com/(.*)") # to get, for instance, 'example1' or 'example2' etc.

with open(file_path) as fin, open("URL Cleaned.txt", "wt") as fout:
    lines = fin.readlines()
    bases = set(re.search(p, line).groups()[0] for line in lines if len(line) > 1)
    for b in bases:
        fout.write(b + '\n') # newline so each base lands on its own line
Using with open(...) automatically closes the files after executing the block of code.
Output:
Using a text file with:
www.example.com/example1
www.example.com/example2
# blank lines are accounted for
www.example.com/example3
www.example.com/example4
www.example.com/example4 # as are duplicates
as the lines, I got the output,
example1
example2
example3
example4

Write delimited strings from an infile to an outfile

The purpose is to search an infile (html) and reproduce the URLs of any images in an outfile which could be passed to wget. This would be the first useful thing I've written in Python and it appears to work well on Fedora. I couldn't find anything specifically like this anywhere. Does anyone have suggestions for improving on this?
import fileinput
import re

# replace 'output.txt' with the name of your outfile
file = open('output.txt', 'w')
# prefix and postfix are how we discriminate your substring from the infile's line
prefix = '<img src='
postfix = '.jpg'
# read through the infile line-by-line
for line in fileinput.input():
    if re.search(prefix, line):
        # from the if above, if you find the prefix, assign the integer to first_index
        first_index = line.index(prefix)
        if re.search(postfix, line):
            # same as the comment above, but for postfix
            second_index = line.index(postfix)
            # write your string plus a newline to the outfile
            file.write(line[first_index+prefix.__len__():second_index+postfix.__len__()]+'\n')
I've done something like this in the past and it worked pretty well... I'm sure it will be more accurate than trying to parse with regex.
from HTMLParser import HTMLParser

class ImageFinder(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.file = open('output.txt', 'w')
    def handle_starttag(self, tag, attrs):
        if tag == "img":
            url = [u[1] for u in attrs if u[0] == "src"][0]
            self.file.write(url+"\n")
    def __exit__(self):
        self.file.close()

inputdata = open("myfile.txt").read()
parser = ImageFinder()
parser.feed(inputdata)
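One caveat with this sketch: __exit__ is only invoked when an object is used as a context manager, and the parser above is never used in a with statement, so output.txt may not be closed promptly. Calling it explicitly after parser.feed(inputdata) is a safe addition:

parser.file.close() # make sure the collected URLs are flushed to disk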

Python grep: look for a pattern and then a number of lines before it

I'm looking to do the equivalent of grep -B14 MMA.
I have a URL that I open and it spits out many lines.
I want to find the line that has 'MMa', then print the 14th line before it.
I don't even know where to begin with this.
import urllib
import urllib2

url = "https://longannoyingurl.com"
opts = {
    'action': 'Dump+It'
}
data = urllib.urlencode(opts)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req)
print response.read() # gives the full html output
Instead of doing a bare read on the response object, call readlines, and then run a regular expression through each line. If the line matches, print the 14th line before it, but check that you're not indexing below zero. E.g.
import re

lines = response.readlines()
r = re.compile(r'MMa')
for i in range(len(lines)):
    if r.search(lines[i]):
        print lines[max(0, i-14)]
Thanks to Dan, I got my result:
import urllib
import urllib2
import re

url="https://somelongannoyingurl/blah/servlet"
opts = {
    'authid': 'someID',
    'action': 'Dump+It'
}
data = urllib.urlencode(opts)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req)
lines = response.readlines()
r = re.compile(r'MMa')
for i in range(len(lines)):
    if r.search(lines[i]):
        line = lines[max(0, i-14)].strip()
        junk,mma = line.split('>')
        print mma.strip()
You can split a single string into a list of lines using mystr.splitlines(). You can test if a string matches a regular expression using re.match(). Once you find the matching line(s), you can index backwards into your list of lines to find the 14th line before.
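A minimal sketch of that approach, assuming response is the urllib2 response object from the question:

import re

text = response.read()
lines = text.splitlines()
for i, line in enumerate(lines):
    if re.search(r'MMa', line): # re.search matches anywhere in the line, like grep
        print lines[max(0, i - 14)]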
