How to check for image use in tweets in Tweepy - python

I have written code to extract tweets from a list of users [handles]. I am writing the information to a .txt file called "results".
with open("results", "w") as fp:
for handle in handles:
print("Analyzing tweets from " + handle + "...")
user = api.get_user(id=handle)
fp.write("Handle: " + handle + "\n")
fp.write("Name: " + user.name + "\n")
fp.write("Description: " + str(user.description.encode(sys.stdout.encoding, errors='replace')) + "\n")
fp.write("Followers: " + str(user.followers_count) + "\n")
fp.write("Following: " + str(user.friends_count) + "\n")
tweet_counter = 0
prosocial_tweets_count = 0
regular_tweets_count = 0
all_tweets = []
social_tweets_len = []
regular_tweets_len = []
social_tweets_valence = []
regular_tweets_valence = []
regular_attachments = 0
social_attachments = 0
for tweet in tweepy.Cursor(api.user_timeline, id=user.id).items():
#control for timeline
dt = tweet.created_at
if dt > date_until:
continue
if dt < date_from:
break # XXX: I hope it's OK to break here
if include_retweets == "no" and tweet.text.startswith("RT"):
continue
if include_replies == "no" and tweet.in_reply_to_user_id:
continue
tweet_counter += 1
for word in vocabulary:
if word in tweet.text.lower():
#increase count of pro social tweets
prosocial_tweets_count += 1
#clean the tweet for valence analysis
clean = TextBlob(tweet.text.lower())
#calculate valence
valence = clean.sentiment.polarity
#append the valence to a list
social_tweets_valence.append(valence)
#append the length of the tweet to a list
social_tweets_len.append(len(tweet.text))
#check if there is an attachment
counting = tweet.text.lower()
counting_attachments = counting.count(" https://t.co/")
social_attachments = social_attachments + counting_attachments
#write date
fp.write(" * " + str(dt) + "\n")
#write the tweet
fp.write(" " + str(tweet.text.encode(sys.stdout.encoding, errors='replace')) + "\n")
#write the length of the tweet
fp.write(" Length of tweet " + str(len(tweet.text)) + "\n")
#write the valence of the tweet
fp.write(" Tweet valance " + str(valence) + "\n")
#write the retweets of the tweet
fp.write(" Retweets count: " + str(tweet.retweet_count) + "\n")
#write the likes of the tweet
fp.write(" Likes count: " + str(tweet.favorite_count) + "\n")
# Report each tweet only once whenever it contains more than one prosocial words
break
else:
#this code runs if the tweet is not prosocial
regular_tweets_count += 1
clean = TextBlob(tweet.text.lower())
valence = clean.sentiment.polarity
counting = tweet.text.lower()
counting_attachments = counting.count(" https://t.co/")
regular_attachments = regular_attachments + counting_attachments
regular_tweets_valence.append(valence)
regular_tweets_len.append(len(tweet.text))
attachments = regular_attachments + social_attachments
I was wondering whether anyone knows of a nice way to check whether a tweet contains images or videos. I would also like to build a list of the average number of images and videos used per user.

If you look at this thread, you will see that all media in a tweet are actually stored in tweet.entities['media'].
Therefore, if you want to know whether a given tweet (a tweepy.models.Status object, the format used by tweepy) contains a picture, you could try this:
try:
    # True if at least one attached medium is a photo
    print(True in [medium['type'] == 'photo' for medium in tweet.entities['media']])
except KeyError:
    # tweets without attachments have no 'media' key in entities
    print("No picture in this tweet")
I hope it helps.
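For the per-user averages mentioned in the question, here is a minimal sketch (not part of the original answer): it assumes an authenticated tweepy API object and your handles list; the .items(200) cap and the media_per_user name are purely illustrative, and extended_entities is only present on tweets that actually carry attachments.
import tweepy

# assumes `api` is an authenticated tweepy.API instance and `handles` is your list
media_per_user = {}
for handle in handles:
    user = api.get_user(id=handle)
    media_count = 0
    tweet_count = 0
    for tweet in tweepy.Cursor(api.user_timeline, id=user.id).items(200):
        tweet_count += 1
        # extended_entities (when present) lists photo, video and GIF attachments
        entities = getattr(tweet, "extended_entities", None) or {}
        for medium in entities.get("media", []):
            if medium["type"] in ("photo", "video", "animated_gif"):
                media_count += 1
    if tweet_count:
        media_per_user[handle] = media_count / tweet_count

print(media_per_user)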

The data comes back from the Twitter API as JSON, so each tweet object already carries all of its fields and values. If you just want to check whether an image is present, a simple conditional on the entities is enough:
if 'media' in tweet.entities:
    print("yes")
else:
    print("no")

Related

Updates to text file are not being parsed using python

I'm parsing data from a text file ('placlog.txt') that is continuously being updated. When I run the code, everything prints as expected, but if the placlog file is updated while the code is running, the new entries are not printed.
The placlog file is being updated by a third-party program, and I am using the code below to read the file and print any updates.
Once formatted, the text should be sent via a Telegram API. This part is also working initially.
import urllib.parse
import time
import requests
import os

def post_to_telegram(msg):
    #print(msg)
    base_url = 'https://api.telegram.org/bot&text="{}'.format(msg)
    requests.get(base_url)

def check_url_inMsgList(stringToMatch, msgList):
    for i in msgList:
        if (stringToMatch in i):
            return False
    return True

try:
    f = open("oldFile.txt", "r")
    msgList = f.read().split("\n")
    f.close()
except:
    f = open("oldFile.txt", "w")
    msgList = []
    f.close()

selections = []
urr = ""
name = ""
pie = ""
ourLines = 2400
url_found = 0
name_found = 0
pie_found = 0

while (True):
    file1 = open('placlog.txt', 'r')
    Lines = file1.readlines()
    file1.close()
    while (True):
        # print("-------------------------------")
        if (ourLines == len(Lines)):
            break
        elif (ourLines > len(Lines)):
            ourLines = 0
        else:
            txt = Lines[ourLines].strip()
            tlist = txt.split("&")
            ourLines = ourLines + 1
            for subtxt in tlist:
                if "eventurl=" in subtxt:
                    a = subtxt[9:len(subtxt) - 3]
                    url = "www.awebsite.com/%23" + a.replace("%23", "/")
                    #url = url.replace("%23", "#")
                    for i in range(10):
                        if "F" + str(i) + "/" in url:
                            url = url.split("F" + str(i) + "/")[0] + "F" + str(i) + "/"
                    urr = url
                    url_found = 1
                elif "bit=" in subtxt:
                    name = urllib.parse.unquote(subtxt[4:len(subtxt)])
                    name_found = 1
                elif "pie\":" in subtxt:
                    a = subtxt.split("price")[1]
                    pie = a.split("\"")[2]
                    pie = float(pie)
                    pie = round(pie, 1)
                    pie = str(pie)
                    pie_found = 1
                    selections.append(url + name + pie)
                    msg = (url + " " + name + " " + pie)
                    stringToFind = url + " " + name
                    if (check_url_inMsgList(stringToFind, msgList)):
                        post_to_telegram(msg)
                        msgList.append(msg)
                        print(msg)
                        f = open("oldFile.txt", "a+")
                        f.write(msg + "\n")
                        f.close()
                        time.sleep(0.5)
                elif "minodds=" in subtxt:
                    a = subtxt.split("minodds=")[1]
                    pie = a.split("&")[0]
                    pie = float(pie)
                    rie = round(pie, 1)
                    pie = str(pie)
                    pie_found = 1
                    selections.append(url + name + pie)
                    msg = (url + " " + name + " " + pie)
                    stringToFind = url + " " + name
                    if (check_url_inMsgList(stringToFind, msgList)):
                        post_to_telegram(msg)
                        msgList.append(msg)
                        print(msg)
                        f = open("oldFile.txt", "a+")
                        f.write(msg + "\n")
                        f.close()
                        time.sleep(0.5)
    time.sleep(1)
I would recommend using watchdog, and seeing if that helps your situation. It can monitor for file system changes, so you could define a function which is executed when the placlog.txt file is changed/updated.
A good guide can be found here: http://thepythoncorner.com/dev/how-to-create-a-watchdog-in-python-to-look-for-filesystem-changes/
From that guide, you can simply change the defined functions to suit your needs, e.g.:
def on_modified(event):
    if event.src_path == "path/to/placlog.txt":
        with open('placlog.txt', 'r') as placlog:
            lines = placlog.readlines()
Could you try this out and see if it helps? I still recommend the with statement for file i/o since you always want your file to close no matter what.
This link might also be useful since they are also monitoring a single .txt file: Python Watchdog - src_path inconsistent
watchdog documentation: https://pythonhosted.org/watchdog/
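For reference, here is a minimal sketch of wiring such a handler into a watchdog observer; it assumes placlog.txt sits in the current directory, and process_new_lines() is a hypothetical placeholder for your existing parsing/Telegram code:
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

class PlaclogHandler(FileSystemEventHandler):
    def on_modified(self, event):
        if event.src_path.endswith("placlog.txt"):
            with open("placlog.txt", "r") as placlog:
                lines = placlog.readlines()
            # process_new_lines(lines)  # hypothetical hook for your parsing code

observer = Observer()
observer.schedule(PlaclogHandler(), path=".", recursive=False)
observer.start()
try:
    while True:
        time.sleep(1)
finally:
    observer.stop()
    observer.join()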
Note: Deleted the old answer since you clarified the question.

Python API call JSON to CSV iteration overwriting data

New to Python... trying to perform an API call and output the data to CSV.
It works fine for a single variable, but when I iterate through a list, only the last item in the list ends up in the output.
Looking to find the best way to approach this. Not sure if it's just an issue of logic, or whether I need a way to consistently append to the file and clear old results when it's rerun.
list = open(infile, "r")
print("-------------Attempting to Query Data----------------")
for item in list:
    try:
        if inp == 1:
            eval_string = "&value=" + item
        elif inp == 2:
            eval_string = "&value__regexp=.*." + item + "*"
        else:
            print("Invalid input!")
        result = requests.get(url + "api_key=" + api + "&username=" + user + eval_string + "&limit=" + str(limit) + "&status=" + status)
        data = result.json()
        if result.status_code == 200:
            with open(outfile, 'w', newline='') as data_file:
                writer = csv.writer(data_file)
                count = 0
                for obj in data['objects']:
                    if count == 0:
                        header = 'value', 'confidence', 'type', 'source', 'date', 'status'
                        writer.writerow(header)
                        count += 1
                    writer.writerow([obj['value'], obj['confidence'], obj['type'], obj['source'], obj['date'], obj['status']])
        else:
            print("-------------Failed to connect - check API config info or that site is up----------------")
    except OSError:
        print("Failed to query.")
print("-------------Results returned in " + outfile + "----------------")

Twitter API - not collecting all tweets using Tweepy

I'm using Tweepy to collect tweets from the Twitter API by their Tweet ID.
I'm trying to read in a file full of the IDs, get the previous tweet from the conversation stream, then store that tweet and its author's screen name etc. in a text file. Some of the tweets have been deleted or the user's profile has been set to private, in which case I want to ignore that tweet and move on to the next. However, for some reason, I'm not collecting all accessible tweets. It's storing maybe 3/4 of all tweets that aren't private and haven't been deleted. Any ideas why it's not catching everything?
Thanks in advance.
def getTweet(tweetID, tweetObj, callTweetObj, i):
    tweet = callTweetObj.text.encode("utf8")
    callUserName = callTweetObj.user.screen_name
    callTweetID = tweetObj.in_reply_to_status_id_str
    with open("call_tweets.txt", "a") as calltweets:
        output = (callTweetObj.text.encode('utf-8') + "\t" + callTweetID + "\t" + tweetID)
        calltweets.write(output)
        print output
    with open("callauthors.txt", "a") as callauthors:
        cauthors = (callUserName + "\t" + "\t" + callTweetID + "\n")
        callauthors.write(cauthors)
    with open("callIDs.txt", "a") as callIDs:
        callIDs.write(callTweetID + "\n")
    with open("newResponseIDs.txt", "a") as responseIDs:
        responseIDs.write(tweetID)

count = 0
file = "Response_IDs.txt"
with open(file, 'r+') as f:
    lines = f.readlines()
    for i in range(0, len(lines)):
        tweetID = lines[i]
        sleep(5)
        try:
            tweetObj = api.get_status(tweetID)
            callTweetID = tweetObj.in_reply_to_status_id_str
            callTweetObj = api.get_status(callTweetID)
            getTweet(tweetID, tweetObj, callTweetObj, i)
            count = count + 1
            print count
        except:
            pass
You haven't provided any information about the response coming back from api.get_status, so it's hard to tell what the error is.
However, it might be that you have reached the rate limit for the statuses/show/:id request. The API specifies this request is limited to 180 requests per window.
You can use Tweepy to call application/rate_limit_status:
response = api.rate_limit_status()
remaining = response['resources']['statuses']['/statuses/show/:id']['remaining']
assert remaining > 0
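As a side note (an assumption, not part of the answer above): in Tweepy 3.x you can also let the library pause for you when the limit is hit by constructing the API object with the rate-limit flags, e.g.:
import tweepy

# the credential variables below are placeholders for your own keys
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# sleeps until the rate-limit window resets instead of raising, and prints a notice
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)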

Python export to file via ofile without bracket characters

I successfully simplified a Python module that imports data from a spectrometer (I'm a total beginner; somebody else wrote the model of the code for me...).
I only have one problem: half of the output data (in a .csv file) is surrounded by brackets: []
I would like the file to contain a structure like this:
name, wavelength, measurement
i.e.
a,400,0.34
a,410,0.65
...
but what I get is:
a,400,[0.34]
a,410,[0.65]
...
Is there any simple fix for this?
Is it because measurement is a string?
Thank you
import serial  # requires pyserial library

ser = serial.Serial(0)
ofile = file('spectral_data.csv', 'ab')
while True:
    name = raw_input("Pigment name [Q to finish]: ")
    if name == "Q":
        print "bye bye!"
        ofile.close()
        break
    first = True
    while True:
        line = ser.readline()
        if first:
            print " Data incoming..."
            first = False
        split = line.split()
        if 10 <= len(split):
            try:
                wavelength = int(split[0])
                measurement = [float(split[i]) for i in [6]]
                ofile.write(str(name) + "," + str(wavelength) + "," + str(measurement) + '\n')
            except ValueError:
                pass  # handles the table heading
        if line[:3] == "110":
            break
    print " Data gathered."
    ofile.write('\n')
The brackets appear because measurement is a list (the comprehension produces e.g. [0.34]), and str() of a list keeps the brackets. Either join the list's elements as strings:
measurement = [float(split[i]) for i in [6]]
ofile.write(str(name) + "," + str(wavelength) + "," + ",".join(str(m) for m in measurement) + '\n')
or simply write the raw field:
ofile.write(str(name) + "," + str(wavelength) + "," + split[6] + '\n')
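A small sketch, not from the answer above: the csv module can also take care of the separators and quoting for you, assuming name, wavelength and split are the variables from the question's loop:
import csv

with open('spectral_data.csv', 'ab') as ofile:  # Python 2: append in binary mode
    writer = csv.writer(ofile)
    # each value becomes its own column, with no list brackets
    writer.writerow([name, wavelength, split[6]])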

Extract specific entries from blastx output file, write to new file

I have created a script that successfully searches for keywords (specified by user) within a Blastx output file in XML format. Now, I need to write those records (query, hit, score, evalue, etc) that contain the keyword in the alignment title to a new file.
I have created separate lists for each of the query titles, hit title, e-value and alignment lengths but cannot seem to write them to a new file.
Problem #1: what if Python errors and one of the lists is missing a value? Then all the other lists will give wrong information relative to the query ("line slippage", if you will).
Problem #2: even if Python doesn't error and all the lists are the same length, how can I write them to a file so that the first item in each list is associated with the others (and thus item #10 from each list is also associated)? Should I create a dictionary instead?
Problem #3: dictionaries have only a single value per key; what if my query has several different hits? I'm not sure whether it will be overwritten, skipped, or just error. Any suggestions? My current script:
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import re

#obtain full path to blast output file (*.xml)
outfile = input("Full path to Blast output file (XML format only): ")
#obtain string to search for
search_string = input("String to search for: ")
#open the output file
result_handle = open(outfile)
#parse the blast record
blast_records = NCBIXML.parse(result_handle)
#initialize lists
query_list = []
hit_list = []
expect_list = []
length_list = []
#create 'for loop' that loops through each HIGH SCORING PAIR in each ALIGNMENT from each RECORD
for record in blast_records:
    for alignment in record.alignments:  #for description in record.descriptions???
        for hsp in alignment.hsps:  #for title in description.title???
            #search for designated string
            search = re.search(search_string, alignment.title)
            #if search comes up with nothing, end
            if search is None:
                print ("Search string not found.")
                break
            #if search comes up with something, add it to a list of entries that match search string
            else:
                #option to include an 'exception' (if it finds keyword then DOES NOT add that entry to list)
                if search is "trichomonas" or "entamoeba" or "arabidopsis":
                    print ("found exception.")
                    break
                else:
                    query_list.append(record.query)
                    hit_list.append(alignment.title)
                    expect_list.append(expect_val)
                    length_list.append(length)
                    #explicitly convert 'variables' ['int' object or 'float'] to strings
                    length = str(alignment.length)
                    expect_val = str(hsp.expect)
                    #print ("\nquery name: " + record.query)
                    #print ("alignment title: " + alignment.title)
                    #print ("alignment length: " + length)
                    #print ("expect value: " + expect_val)
                    #print ("\n***Alignment***\n")
                    #print (hsp.query)
                    #print (hsp.match)
                    #print (hsp.sbjct + "\n\n")

if query_len is not hit_len is not expect_len is not length_len:
    print ("list lengths don't match!")
    break
else:
    qrylen = len(query_list)
    query_len = str(qrylen)
    hitlen = len(hit_list)
    hit_len = str(hitlen)
    expectlen = len(expect_list)
    expect_len = str(expectlen)
    lengthlen = len(length_list)
    length_len = str(lengthlen)
    outpath = str(outfile)
    #create new file
    outfile = open("__Blast_Parse_Search.txt", "w")
    outfile.write("File contains entries from [" + outpath + "] that contain [" + search_string + "]")
    outfile.close
    #write list to file
    i = 0
    list_len = int(query_len)
    for i in range(0, list_len):
        #append new file
        outfile = open("__Blast_Parse_Search.txt", "a")
        outfile.writelines(query_list + hit_list + expect_list + length_list)
        i = i + 1
    #write to disk, close file
    outfile.flush()
    outfile.close
    print ("query list length " + query_len)
    print ("hit list length " + hit_len)
    print ("expect list length " + expect_len)
    print ("length list length " + length_len + "\n\n")
    print ("first record: " + query_list[0] + " " + hit_list[0] + " " + expect_list[0] + " " + length_list[0])
    print ("last record: " + query_list[-1] + " " + hit_list[-1] + " " + expect_list[-1] + " " + length_list[-1])
    print ("\nFinished.\n")
If I understand your problem correctly, you could guard against the "line slippage" by appending a default value whenever something fails, e.g.:
try:
    x(list)
except Exception:
    append_default_value(list)
http://docs.python.org/tutorial/errors.html#handling-exceptions
or use tuples for dictionary keys like (0,1,1) and use the get method for your default value.
http://docs.python.org/py3k/library/stdtypes.html#mapping-types-dict
If you need to maintain data structures in your output files, you might try using the shelve module,
or you could append some kind of reference after each record and give each record a unique id, for example '#32{somekey:value}#21#22#44#';
again, you can have multiple keys by using a tuple.
I don't know if that helps; you might clarify exactly which parts of your code you have trouble with, e.g. "x() gives me output y but I expect z".
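One more hedged sketch for Problems #1-#3, not from the answer above: collect one tuple per matching hit so the fields can never drift out of step, then write one tab-separated line per tuple. It reuses re, blast_records and search_string from the question's script.
matches = []
for record in blast_records:
    for alignment in record.alignments:
        for hsp in alignment.hsps:
            if re.search(search_string, alignment.title):
                # one tuple keeps query, hit title, e-value and length together
                matches.append((record.query, alignment.title,
                                str(hsp.expect), str(alignment.length)))

with open("__Blast_Parse_Search.txt", "w") as out:
    for query, title, expect, length in matches:
        out.write("\t".join([query, title, expect, length]) + "\n")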
