Why doesn't this work? I want to de-duplicate the results I get from the REST API before I write them to the file --
MISP_HOST="https://192.168.1.8"
API_KEY="asdfasdfas"
EXPORT_DATA="attributes/text/download/md5"
OUTPUT_FILE="md5-"+today
def main():
    URL="%s/%s" % (MISP_HOST, EXPORT_DATA)
    request = urllib2.Request(URL)
    f = open(OUTPUT_FILE,'w')
    request.add_header('Authorization', API_KEY)
    data = urllib2.urlopen(request).read()
    set(data)
    print type(data)
    f.write(data)
    f.close()
It runs with no errors but the data is definitely not unique. I'm trying not to do this in bash. Could you explain why it doesn't work, too? Many thanks!
As to why your version doesn't work: set(data) builds a set of the individual characters of the string, and since the result is never assigned to anything, f.write(data) still writes the original, unmodified response. If your result is plain text, you can instead use a regular expression to find all of the words in the text and then build a set from there. This example also lower-cases the words so that the set is case-insensitive, and writes each word on its own line.
import re
import urllib2
MISP_HOST="https://192.168.1.8"
API_KEY="asdfasdfas"
EXPORT_DATA="attributes/text/download/md5"
OUTPUT_FILE="md5-"+today
def main():
    URL="%s/%s" % (MISP_HOST, EXPORT_DATA)
    request = urllib2.Request(URL)
    f = open(OUTPUT_FILE,'w')
    request.add_header('Authorization', API_KEY)
    data = urllib2.urlopen(request).read()
    unique = set(word.lower() for word in re.findall(r'\w+', data))
    # that could be expanded to
    # wordlist = re.findall(r'\w+', data)
    # unique = set(word.lower() for word in wordlist)
    print type(unique)
    f.write('\n'.join(unique))
    f.close()
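If the export really comes back as one value per line (an assumption about the endpoint's output format, not something stated in the question), deduplicating whole lines is an even simpler sketch:

def dedupe_lines(text):
    # return the unique, non-empty lines of text, lower-cased and sorted
    return sorted({line.strip().lower() for line in text.splitlines() if line.strip()})

# e.g. f.write('\n'.join(dedupe_lines(data)))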
I am attempting to loop through a series of text files in a directory, looking for occurrences of certain types of words, and prefixing each found word with a user-defined tag. My code is as follows.
ACC_Tagged_Test = 'C:/ACC_Tag_Test'
for filename in glob.glob(os.path.join(ACC_Tagged_Test, '*.txt')):
    with open(filename) as f:
        data = f.read()

data = data.lower()
modals = {"could":1, "would":1, "should":1, "can":1, "may":1, "might":1}
personal_attribute = {"believes":1, "guess":1, "surmise":1, "considers":1,
                      "presume":1, "speculate":1, "postulate":1, "surmised":1, "assume":1}
approx_adapt = {"broadly":1, "mainly":1, "mostly":1, "loosely":1,
                "generally":1, "usually":1,"typically":1, "regularly":1, "widely":1}
plaus_shields = {"wonder":1, "suspect":1, "theorize":1, "hypothesize":1,
                 "cogitate":1, "contemplate":1, "deliberate":1}
format_modal = "<555>{} ".format
format_attribute = "<666>{} ".format
format_app_adaptor = "<777>{} ".format
format_plaus_shield = "<888>{} ".format
data = " ".join(format_modal(word) if word in modals else word for word in data.split())
data = " ".join(format_attribute(word) if word in personal_attribute else word for word in data.split())
data = " ".join(format_app_adaptor(word) if word in approx_adapt else word for word in data.split())
data = " ".join(format_plaus_shield(word) if word in plaus_shields else word for word in data.split())
with open(filename, "w") as f:
    f.write(str(data))
print(data)  # This is just added in order to check on screen all files
             # are being processed.
My problem is that although the code works on the last file in the directory, it is not working on the previous files (1 out of 10 in this case). I've tried a second for loop above the file write-out statements, but that is not working at all. Can anyone explain what I'm doing wrong here?
Regards
My speculation is that your code is only showing the last file because it's not indented properly to have all the relevant code within the for loop. Try this indentation:
ACC_Tagged_Test = 'C:/ACC_Tag_Test'
for filename in glob.glob(os.path.join(ACC_Tagged_Test, '*.txt')):
    with open(filename) as f:
        data = f.read()
    data = data.lower()
    modals = {"could":1, "would":1, "should":1, "can":1, "may":1, "might":1}
    personal_attribute = {"believes":1, "guess":1, "surmise":1, "considers":1,
                          "presume":1, "speculate":1, "postulate":1, "surmised":1, "assume":1}
    approx_adapt = {"broadly":1, "mainly":1, "mostly":1, "loosely":1,
                    "generally":1, "usually":1,"typically":1, "regularly":1, "widely":1}
    plaus_shields = {"wonder":1, "suspect":1, "theorize":1, "hypothesize":1,
                     "cogitate":1, "contemplate":1, "deliberate":1}
    format_modal = "<555>{} ".format
    format_attribute = "<666>{} ".format
    format_app_adaptor = "<777>{} ".format
    format_plaus_shield = "<888>{} ".format
    data = " ".join(format_modal(word) if word in modals else word for word in data.split())
    data = " ".join(format_attribute(word) if word in personal_attribute else word for word in data.split())
    data = " ".join(format_app_adaptor(word) if word in approx_adapt else word for word in data.split())
    data = " ".join(format_plaus_shield(word) if word in plaus_shields else word for word in data.split())
    with open(filename, "w") as f:
        f.write(str(data))
    print(data)  # This is just added in order to check on screen all files
                 # are being processed.
Assuming all of your code is supposed to be in your for loop: you are overwriting your text file, therefore it looks like only your last run is working:
# this overwrites the file
with open(filename, "w") as fh:
    fh.write(str(data))
change to:
# this appends to the file
with open(filename, "a") as fh:
    fh.write(str(data))
This will append to your text file and will not overwrite previously added data with the data from the last loop.
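As a quick illustration of the difference between the two modes (demo.txt is just a hypothetical file name):

with open("demo.txt", "w") as fh:
    fh.write("first\n")
with open("demo.txt", "w") as fh:   # "w" truncates, so "first" is gone
    fh.write("second\n")
with open("demo.txt", "a") as fh:   # "a" appends instead
    fh.write("third\n")
# demo.txt now contains "second" and "third", but not "first"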
I am writing Python code to extract all the URLs from an input file containing text from Twitter (tweets). However, I realized that several of the URLs extracted into the Python list had 'special characters' or punctuation at the end, because of which I could not parse them further to get the base URL. My question is: how do I identify and remove special characters from the end of every URL in my list?
Current Output:
['https://twitter.com/GVNyqWEu5u', "https://twitter.com/GVNyqWEu5u'", 'https://twitter.com/GVNyqWEu5u##', 'https://twitter.com/GVNyqWEu5u"']
Desired Output:
['https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u']
Note that not all elements in the 'Current Output' list have special characters / punctuation at the end; the task is to identify and remove them only from the list elements that have them.
I am using the following Regex to extract twitter URLs from the Tweet Text: lst = re.findall('(http.?://[^\s]+)', text)
Can I remove the special characters / punctuation at the end of the URL in this step itself?
Full Code:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
from socket import timeout
import ssl
import re
import csv
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
count = 0
file = "Test.CSV"
with open(file,'r', encoding='utf-8') as f, open('output_themes_1.csv', 'w', newline='', encoding='utf-8') as ofile:
    next(f)
    reader = csv.reader(f)
    writer = csv.writer(ofile)
    fir = 'S.No.', 'Article_Id', 'Validity', 'Content', 'Geography', 'URL'
    writer.writerow(fir)
    for line in reader:
        count = count+1
        text = line[5]
        lst = re.findall('(http.?://[^\s]+)', text)
        if not lst:
            x = count, line[0], 'Empty List', text, line[8], line[6]
            print (x)
            writer.writerow(x)
        else:
            try:
                for url in lst:
                    try:
                        html = urllib.request.urlopen(url, context=ctx, timeout=60).read()
                        #html = urllib.request.urlopen(urllib.parse.quote(url, errors='ignore'), context=ctx).read()
                        soup = BeautifulSoup(html, 'html.parser')
                        title = soup.title.string
                        str_title = str (title)
                        if 'Twitter' in str_title:
                            if len(lst) > 1: break
                            else: continue
                        else:
                            y = count, line[0], 'Parsed', str_title, line[8], url
                            print (y)
                            writer.writerow(y)
                    except UnicodeEncodeError as e:
                        b_url = url.encode('ascii', errors='ignore')
                        n_url = b_url.decode("utf-8")
                        try:
                            html = urllib.request.urlopen(n_url, context=ctx, timeout=90).read()
                            soup = BeautifulSoup(html, 'html.parser')
                            title = soup.title.string
                            str_title = str (title)
                            if 'Twitter' in str_title:
                                if len(lst) > 1: break
                                else: continue
                            else:
                                z = count, line[0], 'Parsed_2', str_title, line[8], url
                                print (z)
                                writer.writerow(z)
                        except Exception as e:
                            a = count, line[0], str(e), text, line[8], url
                            print (a)
                            writer.writerow(a)
            except Exception as e:
                b = count, line[0], str(e), text, line[8], url
                print (b)
                writer.writerow(b)
print ('Total Rows Analyzed:', count)
Assuming the special characters occur at the end of the string, you may use:
import re

mydata = ['https://twitter.com/GVNyqWEu5u', "https://twitter.com/GVNyqWEu5u'", 'https://twitter.com/GVNyqWEu5u##', 'https://twitter.com/GVNyqWEu5u"']
mydata = [re.sub(r'[^a-zA-Z0-9]+$', '', item) for item in mydata]
print(mydata)
Prints:
['https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u']
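A similar effect can be had with str.rstrip and string.punctuation; note this sketch also strips any trailing '/' or '.', which may or may not be what you want:

import string

mydata = ['https://twitter.com/GVNyqWEu5u', "https://twitter.com/GVNyqWEu5u'",
          'https://twitter.com/GVNyqWEu5u##', 'https://twitter.com/GVNyqWEu5u"']
print([url.rstrip(string.punctuation) for url in mydata])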
Assuming your list is called urls:
def remove_special_chars(url, char_list=None):
    if char_list is None:
        # Build your own default list here
        char_list = ['#', '%']
    for character in char_list:
        if url.endswith(character):
            return remove_special_chars(url[:-1], char_list)
    return url

urls = [remove_special_chars(url) for url in urls]
If you want to get rid of a specific set of characters, just change the default value or pass a proper list as an argument.
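For example, to also strip trailing quote characters (an illustrative character list, not part of the original answer):

urls = [remove_special_chars(url, ['#', '%', '"', "'"]) for url in urls]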
You could try this -
lst = [re.sub(r'[\'"#]+$', '', i) for i in re.findall(r'(http.?://[^\s]+)', text)]
You can just add more characters that you want to strip to the character class in the sub, according to your requirements.
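For example, applied to a made-up tweet text built from the sample URLs above:

import re

text = 'see https://twitter.com/GVNyqWEu5u## and https://twitter.com/GVNyqWEu5u"'
lst = [re.sub(r'[\'"#]+$', '', i) for i in re.findall(r'(http.?://[^\s]+)', text)]
print(lst)  # ['https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u']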
Basically what I'm trying to do is make a program in Python which takes a URL, copies the source, and pulls all comments out and presents them to the user.
import urllib2
import html2text
import PullsCommentsOut.pullscommentsout
url = raw_input('Please input URL with the text you want to analyze: ')
page = urllib2.urlopen(url)
html_content = page.read().decode('utf8')
rendered_content = html2text.html2text(html_content).encode('ascii', 'ignore')
f = open('file_text.txt', 'wb')
f.write(rendered_content)
f.close()
result = PullsCommentsOut.pullscommentsout(html_content)
print result
And my second file, 'PullsCommentsOut'
import re
def pullscommentsout():
    def comment_remover(text):
        def replacer(match):
            s = match.group(0)
            if s.startswith('/'):
                print s
                return " "  # note: a space and not an empty string
            else:
                return s
        pattern = re.compile(
            r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
            re.DOTALL | re.MULTILINE
        )
        return re.sub(pattern, replacer, text)

    fd = open("test.c", "r")
    buf = fd.read()
    comment_remover(buf)
For the life of me I can't figure out why Python thinks I'm not importing the proper module. It doesn't make sense.
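For what it's worth, one way the pieces could line up (a sketch, assuming PullsCommentsOut.py is a plain module sitting next to the main script, which the question doesn't confirm) is to give pullscommentsout a text parameter and return the result, then import it with from PullsCommentsOut import pullscommentsout and call pullscommentsout(html_content):

# PullsCommentsOut.py (sketch)
import re

def pullscommentsout(text):
    # strip C-style comments from text, using the same pattern as above
    pattern = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )

    def replacer(match):
        s = match.group(0)
        # keep string literals, replace comments with a space
        return " " if s.startswith('/') else s

    return re.sub(pattern, replacer, text)

The original import PullsCommentsOut.pullscommentsout only works if PullsCommentsOut is a package (a directory with __init__.py) containing a pullscommentsout submodule, which is the likely source of the import error.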
I'm writing a script where one of its functions is to read a CSV file that contains URLs in one of its columns. Unfortunately the system that creates those CSVs doesn't put double quotes around values in the URL column, so when the URL contains commas it breaks all my CSV parsing.
This is the code I'm using:
with open(accesslog, 'r') as csvfile, open('results.csv', 'w') as enhancedcsv:
    reader = csv.DictReader(csvfile)
    for row in reader:
        self.uri = (row['URL'])
        self.OriCat = (row['Category'])
        self.query(self.uri)
        print self.URL+","+self.ServerIP+","+self.OriCat+","+self.NewCat
This is a sample URL that is breaking the parsing; it comes in the column named "URL" (note the commas at the end):
ams1-ib.adnxs.com/ww=1238&wh=705&ft=2&sv=43&tv=view5-1&ua=chrome&pl=mac&x=1468251839064740641,439999,v,mac,webkit_chrome,view5-1,0,,2,
The field that follows the URL always comes with a numeric value between parentheses, e.g. (9999), so this could be used to determine where the URL with commas ends.
How can I deal with a situation like this using the csv module?
You will have to do it a little more manually. Try this:
def process(lines, delimiter=','):
    header = None
    url_index_from_start = None
    url_index_from_end = None
    for line in lines:
        if not header:
            header = [l.strip() for l in line.split(delimiter)]
            url_index_from_start = header.index('URL')
            url_index_from_end = len(header)-url_index_from_start
        else:
            data = [l.strip() for l in line.split(delimiter)]
            url_from_start = url_index_from_start
            url_from_end = len(data)-url_index_from_end
            values = data[:url_from_start] + data[url_from_end+1:] + [delimiter.join(data[url_from_start:url_from_end+1])]
            keys = header[:url_index_from_start] + header[url_index_from_start+1:] + [header[url_index_from_start]]
            yield dict(zip(keys, values))
Usage:
lines = ['Header1, Header2, URL, Header3',
'Content1, "Content2", abc,abc,,abc, Content3']
result = list(process(lines))
assert result[0]['Header1'] == 'Content1'
assert result[0]['Header2'] == '"Content2"'
assert result[0]['Header3'] == 'Content3'
assert result[0]['URL'] == 'abc,abc,,abc'
print(result)
Result:
>>> [{'URL': 'abc,abc,,abc', 'Header2': '"Content2"', 'Header3': 'Content3', 'Header1': 'Content1'}]
Have you considered using Pandas to read your data in?
Another possible solution would be to use regular expressions to pre-process the data...
import re

# regex is your URL pattern and filein is the path to your CSV
# read the file once
f = open(filein, 'r')
filedata = f.read()
f.close()
#make a list of everything you want to change (a set, so no URL gets wrapped twice)
old = set(re.findall(regex, filedata))
#append quotes and create a new list
new = []
for url in old:
    url2 = "\"" + url + "\""
    new.append(url2)
#combine the lists
old_new = list(zip(old, new))
#Then use the list to update the file, accumulating the replacements:
newdata = filedata
for old_url, new_url in old_new:
    newdata = newdata.replace(old_url, new_url)
f = open(filein, 'w')
f.write(newdata)
f.close()
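The replace loop can also be collapsed into a single re.sub pass; this sketch reuses the URL pattern from the question and the filein name from the snippet above:

import re

with open(filein, 'r') as f:
    filedata = f.read()

# wrap every URL match in double quotes in one pass
newdata = re.sub(r'(http.?://[^\s]+)', lambda m: '"' + m.group(0) + '"', filedata)

with open(filein, 'w') as f:
    f.write(newdata)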
How can I find only the words that are unique to a text file? If a word is also used in other files then it gets dropped.
Here is a reference http://sahandsaba.com/visualizing-philosophers-and-scientists-by-the-words-they-used-with-d3js-and-python.html
I need a script which loops through all text files in a folder and outputs the results in Json format.
My code so far :
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from pprint import pprint as pp
from glob import glob
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
import codecs
import jinja2
import json
import os
def get_raw_data():
    texts = []
    for x in range(1,95):
        file_name = str(x+1)+".txt"
        with codecs.open(file_name,"rU","utf-8") as myfile:
            data = myfile.read()
        texts.append(data)
        yield file_name, '\n'.join(texts)


class StemTokenizer(object):
    def __init__(self):
        self.ignore_set = {'footnote'}

    def __call__(self, doc):
        words = []
        for word in word_tokenize(doc):
            word = word.lower()
            w = wn.morphy(word)
            if w and len(w) > 1 and w not in self.ignore_set:
                words.append(w)
        return words


def process_text(counts, vectorizer, text, file_name, index):
    result = {w: counts[index][vectorizer.vocabulary_.get(w)]
              for w in vectorizer.get_feature_names()}
    result = {w: c for w, c in result.iteritems() if c > 4}
    normalizing_factor = max(c for c in result.itervalues())
    result = {w: c / normalizing_factor
              for w, c in result.iteritems()}
    return result


def main():
    data = list(get_raw_data())
    print('Data loaded')
    n = len(data)
    vectorizer = CountVectorizer(stop_words='english', min_df=(n-1) / n, tokenizer=StemTokenizer())
    counts = vectorizer.fit_transform(text for p, text in data).toarray()
    print('Vectorization done.')
    print (counts)
    for x in range(95):
        file_name = str(x+1)+".txt"
        # print (text)
        for i, (text) in enumerate(data):
            print (file_name)
            # print (text)
            with codecs.open(file_name,"rU","utf-8") as myfile:
                text = myfile.read()
            result = process_text(counts, vectorizer, text, file_name, i)
            print (result)


if __name__ == '__main__':
    main()
Looks like you've got a bunch of files named 1.txt, 2.txt, ... 95.txt, and you want to find words that occur in one file only. I'd just gather all words, counting how many files each one occurs in; and print out the singletons.
from collections import Counter
import re
fileids = [ str(n+1)+".txt" for n in range(95) ]
filecounts = Counter()
for fname in fileids:
    with open(fname) as fp:    # Add encoding if really needed
        text = fp.read().lower()
    words = re.split(r"\W+", text)  # Keep letters, drop the rest
    filecounts.update(set(words))

singletons = [ word for word in filecounts if filecounts[word] == 1 ]
print(" ".join(singletons))
Done. You don't need scikit, you don't need the nltk, you don't need a pile of IR algorithms. You can use the list of singletons in an IR algorithm, but that's a different story.
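Since the question also asks for JSON output, here is a minimal sketch building on the same counting idea (the output file name unique_words.json is just an assumption):

import json
import re
from collections import Counter

fileids = [str(n + 1) + ".txt" for n in range(95)]
filecounts = Counter()
words_by_file = {}

for fname in fileids:
    with open(fname) as fp:
        words = set(re.split(r"\W+", fp.read().lower())) - {""}
    words_by_file[fname] = words
    filecounts.update(words)

# words that occur in exactly one file, grouped by the file they came from
unique_by_file = {fname: sorted(w for w in words if filecounts[w] == 1)
                  for fname, words in words_by_file.items()}

with open("unique_words.json", "w") as out:
    json.dump(unique_by_file, out, indent=2)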
def parseText():
    # oFile: text file to test
    # myWord: word we are looking for
    # Get all lines into a list
    aLines = oFile.readlines()
    # Loop over the lines to test if the word is found
    for sLine in aLines:
        # Parse the line (split on spaces), returns a list of words
        aLine = sLine.split()
        # Iterate the words and test to see if they match our word
        for sWord in aLine:
            # if it matches, append it to our list
            if sWord == myWord: aWords.append( sWord )

# Create empty list to store all instances of the word that we may find
aWords = []
# Prompt user for the file to search in and the word to search for
oFile = open( raw_input( 'what file to search:' ) )
myWord = str( raw_input( 'what word to search:' ) )
# Call function
parseText()
# Check if list has at least one element
if len( aWords ) < 1: print 'Word not found in file'
else: print str( len( aWords ) ) + ' instances of our word found in file'