AttributeError: 'Document' object has no attribute 'dfs' - python

I attached the full error that I am receiving below and want to know what is causing this.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-154-d356d4172c8c> in <module>
23 save_dictionary(doc.tfs, "dfs_" + str(doc.id) + ".txt")
24
---> 25 vectorize("./textfiles")
26
<ipython-input-154-d356d4172c8c> in vectorize(data_path)
14 Documents.append(doc)
15
---> 16 save_dictionary(doc.dfs, "tf_" + str(doc.id) + ".txt")
17
18 dfs = {}
AttributeError: 'Document' object has no attribute 'dfs'
I have also attached the full code of my program to help troubleshoot the issue. I am not sure what I need to do to fix this. If I change doc.dfs to doc.tfs it runs, but since dfs is supposed to be a separate dictionary I don't think that is correct.
class Document:
    def __init__(self, doc_id):
        # create a new document with its ID
        self.id = doc_id
        # create an empty dictionary
        # that will hold the term frequency (TF) counts
        self.tfs = {}

    def tokenization(self, text):
        # split a title into words,
        # using space " " as delimiter
        words = text.lower().split(" ")
        for word in words:
            # for each word in the list
            if word in self.tfs:
                # if it has been counted in the TF dictionary,
                # add 1 to the count
                self.tfs[word] = self.tfs[word] + 1
            else:
                # if it has not been counted,
                # initialize its TF with 1
                self.tfs[word] = 1

def save_dictionary(diction_data, file_path_name):
    f = open(file_path_name, "w+")
    for key in diction_data:
        # Separate the key from the frequency with a space and
        # add a newline to the end of each key-value pair
        f.write(key + " " + str(diction_data[key]) + "\n")
    f.close()

def vectorize(data_path):
    Documents = []
    for i in range(1, 21):
        file_name = "./textfiles/" + str(i) + ".txt"
        # create a new document with an ID
        doc = Document(i+1)
        # Read the files
        with open(file_name, 'r') as f:
            text = f.read()
        # compute the term frequencies
        # read in the file's contents
        doc.tokenization(text)
        # add the documents to the lists
        Documents.append(doc)
        save_dictionary(doc.tfs, "tf_" + str(doc.id) + ".txt")

    DFS = {}
    for doc in Documents:
        for word in doc.tfs:
            DFS[word] = DFS.get(word, 0) + 1
        save_dictionary(doc.DFS, "DFS_" + str(doc.id) + ".txt")
vectorize("./textfiles")
I have looked at the class and I think the issue is coming from there, but I am unsure exactly what is causing it. I want whatever words are counted across documents to be saved to dfs, so that when I run it I get both the tf files and the dfs files.
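The traceback shows that the failing line reads doc.dfs, but the Document class only ever defines self.id and self.tfs, so any doc.dfs (or doc.DFS) lookup raises AttributeError. The document-frequency counts live in the module-level DFS dictionary built after the per-document loop, not on individual documents, so they should be saved from that dictionary. A minimal sketch of how the end of vectorize could look, assuming the Document class and save_dictionary function above (the single output name dfs.txt is just an illustration):

    # build document frequencies across the whole collection:
    # for each word, count in how many documents it appears
    dfs = {}
    for doc in Documents:
        for word in doc.tfs:
            dfs[word] = dfs.get(word, 0) + 1
    # save the collection-level dictionary once, not from a single document
    save_dictionary(dfs, "dfs.txt")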

Related

How to convert your script into a class?

I wrote some code to convert text in a PDF file into a pandas DataFrame. The code works very well normally, but when I try to fit it into a class and define a function for it, it returns an error.
import pdfplumber
import pandas as pd
import re

cols = ["Declaration Number", "Declaration Date", "Warehouse", "Quantity", "Number of boxes", "Product name", "Invoice Number"]
dataset = []
quant = []
date = []
decl_date = []

decl = re.compile(r'\d{8}AN\d{6}')
decd = re.compile(r'\d{2}\.\d{2}\.\d{4}')
whse = re.compile(r'ANTREPO | LİMAN')
qty = re.compile(r'\d.KAP')
prod = re.compile(r'Ticari')
invNo = re.compile(r'Fatura')

class pdf():
    def __init__(self):
        self.kap = None
        self.kg = None

    def FirstPage():
        with pdfplumber.open("44550500AN087999.pdf") as pdf:
            page = pdf.pages[0]
            text = page.extract_text()
            for line in text.split('\n'):
                if decl.search(line):
                    decl_num = line.split()[-1]
                if decd.search(line):
                    decl_date = []
                    date = []
                    decl_date.append(line.split())
                    date = decl_date[1][-1]
                if whse.search(line):
                    warehouse = line.split()
                if qty.search(line):
                    quant = line.split()
                    kap = quant[0] + " " + quant[1]
                    kg = quant[2] + " " + quant[3]
When I run it, it returns several errors.
For instance:
<ipython-input-26-bc082b4afef0> in FirstPage()
20 date = []
21 decl_date.append(line.split())
---> 22 date = decl_date[1][-1]
23 if whse.search(line):
24 warehouse = line.split()
IndexError: list index out of range
I am probably defining the variables wrong, but I am a newbie, so does anyone have any idea what I am doing wrong?
You are only putting one element into decl_date, and then trying to access the second element inside that list, which does not exist.
Your use of line.split() seems incorrect to me: decl_date.append(line.split()) appends the entire split result as a single element, so decl_date only ever holds one item.
I assume you want to split the string by using the regex in each if-statement; in that case, change line.split() to pattern.split(line)[index], swapping out pattern and index.
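Concretely, the IndexError comes from decl_date[1][-1]: decl_date is reset to an empty list and receives exactly one element from append, so index 1 never exists. A minimal sketch of one way to pull the date out, assuming the decd pattern defined above; the commented fallback assumes the date is the last whitespace-separated token on the line:

m = decd.search(line)
if m:
    date = m.group(0)  # the matched dd.mm.yyyy text itself
    # or, if the date is the last token on the line:
    # date = line.split()[-1]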

can't concat string to bytes

I'm trying to grab Twitter user data by screen name using Python.
The script loops over each of the Twitter accounts in the ids variable and, for each one, grabs its profile information and adds it as a row to the output file.
but I'm getting an error.
This is my code
# LIST OF TWITTER USER IDS
ids = "4816,9715012,13023422, 13393052, 14226882, 14235041, 14292458, 14335586, 14730894,\
15029174, 15474846, 15634728, 15689319, 15782399, 15946841, 16116519, 16148677, 16223542,\
16315120, 16566133, 16686673, 16801671, 41900627, 42645839, 42731742, 44157002, 44988185,\
48073289, 48827616, 49702654, 50310311, 50361094,"

# THE VARIABLE USERS IS A JSON FILE WITH DATA ON THE 32 TWITTER USERS LISTED ABOVE
users = t.lookup_user(user_id = ids)

# NAME OUR OUTPUT FILE - %i WILL BE REPLACED BY CURRENT MONTH, DAY, AND YEAR
outfn = "twitter_user_data_%i.%i.%i.txt" % (now.month, now.day, now.year)

# NAMES FOR HEADER ROW IN OUTPUT FILE
fields = "id screen_name name created_at url followers_count friends_count statuses_count \
favourites_count listed_count \
contributors_enabled description protected location lang expanded_url".split()

# INITIALIZE OUTPUT FILE AND WRITE HEADER ROW
outfp = open(outfn, "w")
#outfp.write(string.join(fields, "\t") + "\n") # header
outfp.write("\t".join(fields) + "\n") # header

# THIS BLOCK WILL LOOP OVER EACH OF THESE IDS, CREATE VARIABLES, AND OUTPUT TO FILE
for entry in users:
    # CREATE EMPTY DICTIONARY
    r = {}
    for f in fields:
        r[f] = ""
    # ASSIGN VALUE OF 'ID' FIELD IN JSON TO 'ID' FIELD IN OUR DICTIONARY
    r['id'] = entry['id']
    # SAME WITH 'SCREEN_NAME' HERE, AND FOR REST OF THE VARIABLES
    r['screen_name'] = entry['screen_name']
    r['name'] = entry['name']
    r['created_at'] = entry['created_at']
    r['url'] = entry['url']
    r['followers_count'] = entry['followers_count']
    r['friends_count'] = entry['friends_count']
    r['statuses_count'] = entry['statuses_count']
    r['favourites_count'] = entry['favourites_count']
    r['listed_count'] = entry['listed_count']
    r['contributors_enabled'] = entry['contributors_enabled']
    r['description'] = entry['description']
    r['protected'] = entry['protected']
    r['location'] = entry['location']
    r['lang'] = entry['lang']
    # NOT EVERY ID WILL HAVE A 'URL' KEY, SO CHECK FOR ITS EXISTENCE WITH IF CLAUSE
    if 'url' in entry['entities']:
        r['expanded_url'] = entry['entities']['url']['urls'][0]['expanded_url']
    else:
        r['expanded_url'] = ''
    print(r)
    # CREATE EMPTY LIST
    lst = []
    # ADD DATA FOR EACH VARIABLE
    for f in fields:
        lst.append(str(r[f]).replace("\/", "/"))
    # WRITE ROW WITH DATA IN LIST
    #outfp.write(string.join(lst, "\t").encode("utf-8") + "\n")
    outfp.write("\t".join(lst).encode('utf-8') + '\n')

outfp.close()
The error message
TypeError Traceback (most recent call last)
<ipython-input-54-441137b1bb4d> in <module>()
37 #WRITE ROW WITH DATA IN LIST
38 #outfp.write(string.join(lst, "\t").encode("utf-8") + "\n")
---> 39 outfp.write("\t".join(lst).encode('utf-8') + '\n')
40
41 outfp.close()
TypeError: can't concat str to bytes
Any idea on how to fix this? The version of Python is 3.6.5
Your help will be greatly appreciated. Thanks.
Edit:
Here is a screenshot of part of my file after I opened the output file in binary mode:
outfp.write("\t".join(lst).encode('utf-8') + '\n')
After you do .encode() on a string you get an instance of bytes. You can't add another string (like \n) to bytes. That's what the error is telling you.
So you need to add the \n before you encode the string. Like below:
outfp.write(("\t".join(lst) + '\n').encode('utf-8'))
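Two other options worth knowing, assuming the same outfn and lst as above: open the file in text mode with an explicit encoding and write plain strings, or open it in binary mode and write only bytes. Mixing the two is exactly what triggers this TypeError.

# option 1: text mode with an explicit encoding, no manual encode() needed
outfp = open(outfn, "w", encoding="utf-8")
outfp.write("\t".join(lst) + "\n")

# option 2: binary mode, everything written must already be bytes
outfp = open(outfn, "wb")
outfp.write(("\t".join(lst) + "\n").encode("utf-8"))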

How to fix errors IndexError: list index out of range

I would like to load data from 10 categories of documents, where each category contains text files, but I keep getting the following error:
IndexError: list index out of range
This is the code:
import os
from os.path import join

def load_data(folder):
    data = []
    files = [join(folder, x) for x in os.listdir(folder)]
    for file in files:
        topic = file.split("/")[9]  # this is where the error occurs
        label = topic.replace(" ", "_")
        name = "__label__" + label
        with open(file, "rb") as f:
            content = f.read()
        content = content.decode('utf-16')
        content = " ".join(i for i in content.split())
        data.append(name + " " + content)
    return data
An easy way to debug this is to add print statements and check what the objects hold. For example, in this case you can add two print statements at the beginning of the for loop. This will help you figure out why you are getting the IndexError:
def load_data(folder):
    data = []
    files = [join(folder, x) for x in os.listdir(folder)]
    for file in files:
        print(file)
        print(file.split("/"))
        topic = file.split("/")[9]  # this is where the error occurs
        label = topic.replace(" ", "_")
        name = "__label__" + label
        with open(file, "rb") as f:
            content = f.read()
        content = content.decode('utf-16')
        content = " ".join(i for i in content.split())
        data.append(name + " " + content)
    return data
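If the printed list has fewer than ten components, the hard-coded index [9] will keep failing whenever the folder sits at a different depth or the path uses a different separator. A minimal sketch of a depth-independent alternative, assuming load_data is called once per category folder, so the category name is simply the name of the directory that contains each text file:

import os

def load_data(folder):
    data = []
    files = [os.path.join(folder, x) for x in os.listdir(folder)]
    for file in files:
        # use the name of the containing directory instead of a fixed index,
        # so the result does not depend on how deeply the folder is nested
        topic = os.path.basename(os.path.dirname(os.path.abspath(file)))
        label = topic.replace(" ", "_")
        name = "__label__" + label
        with open(file, "rb") as f:
            content = f.read().decode("utf-16")
        content = " ".join(content.split())
        data.append(name + " " + content)
    return data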

How to check for image use in tweets in Tweepy

I have written code to extract tweets from a list of users [handles]. I am writing the information to a .txt file called "results".
with open("results", "w") as fp:
for handle in handles:
print("Analyzing tweets from " + handle + "...")
user = api.get_user(id=handle)
fp.write("Handle: " + handle + "\n")
fp.write("Name: " + user.name + "\n")
fp.write("Description: " + str(user.description.encode(sys.stdout.encoding, errors='replace')) + "\n")
fp.write("Followers: " + str(user.followers_count) + "\n")
fp.write("Following: " + str(user.friends_count) + "\n")
tweet_counter = 0
prosocial_tweets_count = 0
regular_tweets_count = 0
all_tweets = []
social_tweets_len = []
regular_tweets_len = []
social_tweets_valence = []
regular_tweets_valence = []
regular_attachments = 0
social_attachments = 0
for tweet in tweepy.Cursor(api.user_timeline, id=user.id).items():
#control for timeline
dt = tweet.created_at
if dt > date_until:
continue
if dt < date_from:
break # XXX: I hope it's OK to break here
if include_retweets == "no" and tweet.text.startswith("RT"):
continue
if include_replies == "no" and tweet.in_reply_to_user_id:
continue
tweet_counter += 1
for word in vocabulary:
if word in tweet.text.lower():
#increase count of pro social tweets
prosocial_tweets_count += 1
#clean the tweet for valence analysis
clean = TextBlob(tweet.text.lower())
#calculate valence
valence = clean.sentiment.polarity
#append the valence to a list
social_tweets_valence.append(valence)
#append the length of the tweet to a list
social_tweets_len.append(len(tweet.text))
#check if there is an attachment
counting = tweet.text.lower()
counting_attachments = counting.count(" https://t.co/")
social_attachments = social_attachments + counting_attachments
#write date
fp.write(" * " + str(dt) + "\n")
#write the tweet
fp.write(" " + str(tweet.text.encode(sys.stdout.encoding, errors='replace')) + "\n")
#write the length of the tweet
fp.write(" Length of tweet " + str(len(tweet.text)) + "\n")
#write the valence of the tweet
fp.write(" Tweet valance " + str(valence) + "\n")
#write the retweets of the tweet
fp.write(" Retweets count: " + str(tweet.retweet_count) + "\n")
#write the likes of the tweet
fp.write(" Likes count: " + str(tweet.favorite_count) + "\n")
# Report each tweet only once whenever it contains more than one prosocial words
break
else:
#this code runs if the tweet is not prosocial
regular_tweets_count += 1
clean = TextBlob(tweet.text.lower())
valence = clean.sentiment.polarity
counting = tweet.text.lower()
counting_attachments = counting.count(" https://t.co/")
regular_attachments = regular_attachments + counting_attachments
regular_tweets_valence.append(valence)
regular_tweets_len.append(len(tweet.text))
attachments = regular_attachments + social_attachments
I was wondering whether anyone knows of a nice way to check if the tweets contain images or videos. I would also like to create a list of the average use of images and videos per user.
If you look at This thread, you will see that all media in a tweet are actually stored in tweet.entities['media'].
Therefore if you want to know if a given tweet (in the format tweepy.models.Status used by tweepy) contains a picture, you could try this:
try:
    print(True in [medium['type'] == 'photo' for medium in tweet.entities['media']])
except:
    print("No picture in this tweet")
I hope it helps.
The data is in JSON format when we fetch it from the Twitter API, and it contains all the data about that tweet as fields and values. So if you just want to check whether an image exists or not, you can make a conditional statement along these lines:
if image:  # "image" stands in for whatever media check you use
    print('yes')
else:
    print('no')
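To get the per-user averages the question mentions, you could count media entities while iterating the timeline. A minimal sketch, assuming the same tweepy Cursor loop as in the question; media_count and tweets_seen are hypothetical counters reset for each handle, and extended_entities (when Twitter provides it) lists every attached photo or video, while entities reports at most one media item:

media_count = 0
tweets_seen = 0
for tweet in tweepy.Cursor(api.user_timeline, id=user.id).items():
    tweets_seen += 1
    # prefer extended_entities when present, fall back to entities
    ents = getattr(tweet, "extended_entities", None) or tweet.entities
    media_count += len(ents.get("media", []))
if tweets_seen:
    fp.write("Average media per tweet: " + str(media_count / tweets_seen) + "\n")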

Extract specific entries from blastx output file, write to new file

I have created a script that successfully searches for keywords (specified by user) within a Blastx output file in XML format. Now, I need to write those records (query, hit, score, evalue, etc) that contain the keyword in the alignment title to a new file.
I have created separate lists for each of the query titles, hit title, e-value and alignment lengths but cannot seem to write them to a new file.
Problem #1: what if Python errors, and one of the lists is missing a value...? Then all the other lists will be giving wrong information in reference to the query ("line slippage", if you will...).
Problem #2: even if Python doesn't error, and all the lists are the same length, how can I write them to a file so that the first item in each list is associated with each other (and thus, item #10 from each list is also associated?) Should I create a dictionary instead?
Problem #3: dictionaries have only a single value per key; what if my query has several different hits? I'm not sure if it will be overwritten or skipped, or if it will just error. Any suggestions? My current script:
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import re

#obtain full path to blast output file (*.xml)
outfile = input("Full path to Blast output file (XML format only): ")
#obtain string to search for
search_string = input("String to search for: ")
#open the output file
result_handle = open(outfile)
#parse the blast record
blast_records = NCBIXML.parse(result_handle)

#initialize lists
query_list=[]
hit_list=[]
expect_list=[]
length_list=[]

#create 'for loop' that loops through each HIGH SCORING PAIR in each ALIGNMENT from each RECORD
for record in blast_records:
    for alignment in record.alignments:  #for description in record.descriptions???
        for hsp in alignment.hsps:  #for title in description.title???
            #search for designated string
            search = re.search(search_string, alignment.title)
            #if search comes up with nothing, end
            if search is None:
                print ("Search string not found.")
                break
            #if search comes up with something, add it to a list of entries that match search string
            else:
                #option to include an 'exception' (if it finds keyword then DOES NOT add that entry to list)
                if search is "trichomonas" or "entamoeba" or "arabidopsis":
                    print ("found exception.")
                    break
                else:
                    query_list.append(record.query)
                    hit_list.append(alignment.title)
                    expect_list.append(expect_val)
                    length_list.append(length)

                    #explicitly convert 'variables' ['int' object or 'float'] to strings
                    length = str(alignment.length)
                    expect_val = str(hsp.expect)

                    #print ("\nquery name: " + record.query)
                    #print ("alignment title: " + alignment.title)
                    #print ("alignment length: " + length)
                    #print ("expect value: " + expect_val)
                    #print ("\n***Alignment***\n")
                    #print (hsp.query)
                    #print (hsp.match)
                    #print (hsp.sbjct + "\n\n")

if query_len is not hit_len is not expect_len is not length_len:
    print ("list lengths don't match!")
    break
else:
    qrylen = len(query_list)
    query_len = str(qrylen)
    hitlen = len(hit_list)
    hit_len = str(hitlen)
    expectlen = len(expect_list)
    expect_len = str(expectlen)
    lengthlen = len(length_list)
    length_len = str(lengthlen)

outpath = str(outfile)
#create new file
outfile = open("__Blast_Parse_Search.txt", "w")
outfile.write("File contains entries from [" + outpath + "] that contain [" + search_string + "]")
outfile.close

#write list to file
i = 0
list_len = int(query_len)
for i in range(0, list_len):
    #append new file
    outfile = open("__Blast_Parse_Search.txt", "a")
    outfile.writelines(query_list + hit_list + expect_list + length_list)
    i = i + 1
    #write to disk, close file
    outfile.flush()
    outfile.close

print ("query list length " + query_len)
print ("hit list length " + hit_len)
print ("expect list length " + expect_len)
print ("length list length " + length_len + "\n\n")
print ("first record: " + query_list[0] + " " + hit_list[0] + " " + expect_list[0] + " " + length_list[0])
print ("last record: " + query_list[-1] + " " + hit_list[-1] + " " + expect_list[-1] + " " + length_list[-1])
print ("\nFinished.\n")
If I understand your problem correctly you could use a default value for the line slippage thing like:
try:
    x(list)
except exception:
    append_default_value(list)
http://docs.python.org/tutorial/errors.html#handling-exceptions
or use tuples for dictionary keys like (0,1,1) and use the get method for your default value.
http://docs.python.org/py3k/library/stdtypes.html#mapping-types-dict
If you need to maintain data structures in your output files you might try using the shelve module:
Or you could append some type of reference after each record and give each record a unique id, for example '#32{somekey:value}#21#22#44#'; again, you can have multiple keys using a tuple.
I don't know if that helps; you might clarify exactly which parts of your code you have trouble with, e.g. "x() gives me output y but I expect z".
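On problems #1 to #3 specifically: one way to keep the fields tied together is to drop the four parallel lists and collect one record per matching hit, then write each record as a line. A query with several hits simply contributes several records, so nothing is overwritten. A minimal sketch, assuming the blast_records and search_string already set up in the script above:

records = []
for record in blast_records:
    for alignment in record.alignments:
        for hsp in alignment.hsps:
            if re.search(search_string, alignment.title):
                # one dict per hit keeps query, hit, e-value and length associated
                records.append({
                    "query": record.query,
                    "hit": alignment.title,
                    "evalue": str(hsp.expect),
                    "length": str(alignment.length),
                })

with open("__Blast_Parse_Search.txt", "w") as out:
    for r in records:
        out.write("\t".join([r["query"], r["hit"], r["evalue"], r["length"]]) + "\n")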
