I have some Python code that preprocesses text and writes it to a file. It removes hashtags, usernames, symbols, links, and stopwords, and also reduces each word to its root:
import tweepy
import time
import os
import sys
import json
import argparse
import re
from collections import defaultdict
import glob
from nltk.stem.snowball import SnowballStemmer
text = "shit.txt"
def process_text(text=text):
text=re.sub('\\B#[a-zA-Z0-9_]*\\b','',text)
text=re.sub('\\B#[a-zA-Z0-9_]*\\b','',text)
text=re.sub('\\B$[a-zA-Z0-9_]*\\b','',text)
text=re.sub('\\bRT\\b','',text)
text = text.lower()
text = re.sub("(https?://[^ ]+)",'',text)
if text:
a1 = [line.split("-")[0] for line in file("ListOfShortWords.txt")]
a2 = [re.sub("\n",'',line.split("-")[1]).encode("utf-8")for line in file("ListOfShortWords.txt")]
HashList = defaultdict(lambda:"nil")
for c in range(0,len(a1)):
HashList[a1[c]] = a2[c]
text = re.sub(r'([aeiou])\1{2,}', r'\1', text)
text = re.sub(r'([^aeiou])\1{2,}', r'\1\1',text)
text = re.sub(r'(.)\1{2,}\\b', r'\1', text)
for key in HashList.keys():
text = re.sub("\\b"+str(key)+"\\b",str(HashList[key]),text)
for stopword in ['about','above','after','ain\'t','aint','again','against','all','am','an','and','any','are','as','at','be','because','been','before','being','below','between','both','but','by','could','did','do','does','doing','down','during','each','few','for','from','further','had','has','have','having','he','he\'d','he\'ll','he\'s''here''here\'s''hers''herself''him''himself','her','his','how','how\'s','i','i\'d','i\'ll','i\'m','i\'ve','ive','if','in','into','is','it','it\'s','its','itself','let\'s','lets','me','more','most','my','myself','no','nor','not','of','off','on','once','only','or','other','ought','our','ours','ourselves','out','over','own','same','she','she\'d','she\'ll','she\'s','shes','should','so','some','such','than','that','that\'s','thats','the','their','theirs','them','themselves','then','there','there\'s','theres','these','they','they\'d','theyd','they\'ll','they\'re','they\'ve','theyll','theyre','theyve','this','those','through','to','too','under','until','up','very','was','we','we\'d','we\'ll','we\'re','we\'ve','were','what','what\'s','whats','when','when\'s','whens','where','where\'s','wheres','which','while','who','who\'s','whos','whom','why','why\'s','whys','with','won\'t','wont','would','you','you\'d','youd','you\'ll','youll','you\'re','you\'ve','youre','youve','your','yours','yourself','yourselves','\'tis','\'twas','tis','twas']:
text = re.sub("\\b"+stopword+"\\b",'',text)
for ch in ['&','$',',','.','/',':',';','"','{','[','}',']','|','\\','+','=','-','_',')','(','*','^','%','!','~','`','?']:
text = text.replace(ch,' ')
text = re.sub("\\b[0-9]*\\b",'',text)
text = text.replace('\'','')
text = re.sub('\\b[a-z]\\b','',text)
text = re.sub(r'[^\x00-\x7F]+',' ',text)
text = ' '.join(text.split())
return text
for pp in ['pos','neg','neu','irr']:
a = 1
for fil in glob.glob("Senti/"+str(pp)+"/*.txt"):
for line in file(fil):
t = process_text(text=line)
realline=''
for word in t.split():
realline = realline+" "+str(SnowballStemmer("english").stem(word)
with open("Processed Senti/"+str(pp)+"/"+str(a)+".txt", 'w') as outf:
outf.write(realline)
a=a+1
I get an error saying:
with open("Processed Senti/"+str(pp)+"/"+str(a)+".txt", 'w') as outf:
^
SyntaxError: invalid syntax
What is wrong with the code? All required folders and files exist.
There is a ) missing in the previous line: the call to str() is not closed.
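For example, with the call closed properly (and the stemmer created once outside the loops, so it is not rebuilt for every word):

    stemmer = SnowballStemmer("english")   # create the stemmer once
    for word in t.split():
        realline = realline + " " + str(stemmer.stem(word))  # note the closing )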
I have a script that cleans URLs down to their base domains, e.g. example.com/example1 and example.com/example2 both become example.com. My issue is that when it goes through the file of URLs, it produces duplicate base domains. I want to remove the duplicates while printing the URLs to a file. Below is the code I currently have.
from Tkinter import *
import tkFileDialog
import re

def main():
    fileOpen = Tk()
    fileOpen.withdraw()  # hiding tkinter window
    file_path = tkFileDialog.askopenfilename(
        title="Open file", filetypes=[("txt file", ".txt")])
    if file_path != "":
        print "you chose file with path:", file_path
    else:
        print "you didn't open anything!"
    fin = open(file_path)
    fout = open("URL Cleaned.txt", "wt")
    for line in fin.readlines():
        editor = (line.replace('[.]', '.')
                  .replace('[dot]', '.')
                  .replace('hxxp://www.', '')
                  .replace('hxxps://www.', '')
                  .replace('hxxps://', '')
                  .replace('hxxp://', '')
                  .replace('www.', '')
                  .replace('http://www.', '')
                  .replace('https://www.', '')
                  .replace('https://', '')
                  .replace('http://', ''))
        editor = re.sub(r'/.*', '', editor)

if __name__ == '__main__':
    main()
Any help is appreciated. I have scoured the posts and tried all of the suggestions for my issue and have not found one that works.
You can use a regular expression to find the base domains.
If you have one url per line in your file:
import re

def main():
    file = open("url.txt", 'r')
    domains = set()
    # works for any url like https://www.domain.com/something/somethingmore...,
    # also without www, without https, or just www.domain.org
    matcher = re.compile("(h..ps?://)?(?P<domain>(www\.)?[^/]*)/?.*")
    for line in file:
        # make here any replace you need for obfuscated urls, e.g. line = line.replace('[.]', '.')
        if line[-1] == '\n':  # remove "\n" from end of line if present
            line = line[0:-1]
        match = matcher.search(line)
        if match is not None:  # a url has been found
            domains.add(match.group('domain'))
    print domains
    file.close()

main()
For example, with a file of such urls, it will print:
set(['platinum-shakers.net', 'wmi.ns01.us', 'adservice.no-ip.org', 'samczeruno.pl', 'java.ns1.name', 'microsoft.dhcp.biz', 'ids.us01.us', 'devsite.quostar.com', 'orlandmart.com'])
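If you want the unique domains written to a file rather than printed (as in your original script), a minimal sketch:

    with open("URL Cleaned.txt", "wt") as fout:
        for domain in domains:
            fout.write(domain + "\n")  # one base domain per line; the set already removed duplicates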
Perhaps you could use a regular expression:
import re

p = re.compile(r".*\.com/(.*)")  # to get, for instance, 'example1' or 'example2' etc.
with open(file_path) as fin, open("URL Cleaned.txt", "wt") as fout:
    lines = fin.readlines()
    bases = set(re.search(p, line).groups()[0] for line in lines if len(line) > 1)
    for b in bases:
        fout.write(b + "\n")  # newline so each base lands on its own line
Using with open(...) automatically closes the files after executing the block of code.
Output:
Using a text file with:
www.example.com/example1
www.example.com/example2
# blank lines are accounted for
www.example.com/example3
www.example.com/example4
www.example.com/example4 # as are duplicates
as the input lines, I got the output:
example1
example2
example3
example4
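To fold the set-based deduplication back into the original script, here is a rough sketch, assuming file_path comes from the tkFileDialog call and that the input uses the same hxxp/[.] defanging the script already strips:

    import re

    def clean(line):
        # undo the defanging used in the source file
        for old, new in [('[.]', '.'), ('[dot]', '.'), ('hxxp', 'http')]:
            line = line.replace(old, new)
        line = re.sub(r'https?://', '', line)    # drop the scheme
        line = re.sub(r'^www\.', '', line)       # drop a leading www.
        return re.sub(r'/.*', '', line).strip()  # keep only the base domain

    seen = set()
    with open(file_path) as fin, open("URL Cleaned.txt", "wt") as fout:
        for line in fin:
            domain = clean(line)
            if domain and domain not in seen:  # skip blank lines and duplicates
                seen.add(domain)
                fout.write(domain + "\n")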
I'm looking to do the equivalent of grep -B14 MMa.
I have a URL that I open, and it spits out many lines. I want to find the line that contains 'MMa' and then print the 14th line before it.
I don't even know where to begin with this.
import urllib
import urllib2

url = "https://longannoyingurl.com"
opts = {
    'action': 'Dump+It'
}
data = urllib.urlencode(opts)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req)
print response.read()  # gives the full html output
Instead of doing a bare read() on the response object, call readlines() instead, and then run a regular expression over each line. If the line matches, print the 14th line before it, taking care not to index below zero. E.g.
import re

lines = response.readlines()
r = re.compile(r'MMa')
for i in range(len(lines)):
    if r.search(lines[i]):
        print lines[max(0, i - 14)]
Thanks to Dan, I got my result:
import urllib
import urllib2
import re

url = "https://somelongannoyingurl/blah/servlet"
opts = {
    'authid': 'someID',
    'action': 'Dump+It'
}
data = urllib.urlencode(opts)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req)

lines = response.readlines()
r = re.compile(r'MMa')
for i in range(len(lines)):
    if r.search(lines[i]):
        line = lines[max(0, i - 14)].strip()
        junk, mma = line.split('>')
        print mma.strip()
You can split a single string into a list of lines using mystr.splitlines(). You can test whether a line contains a pattern using re.search() (re.match() only matches at the start of a string). Once you find the matching line(s), you can index backwards into your list of lines to find the 14th line before.
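A short sketch of that approach, using re.search since 'MMa' can occur anywhere in a line:

    import re

    text = response.read()
    lines = text.splitlines()
    for i, line in enumerate(lines):
        if re.search(r'MMa', line):
            print lines[max(0, i - 14)]  # clamp at 0 so we never index before the first line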