How to find a value for a specific key in Python

import requests
import json

# initial message
message = "if i can't let it go out of my mind"
# split into list
split_message = message.split()

def decrementList(words):
    for w in [words] + [words[:-x] for x in range(1, len(words))]:
        url = 'http://ws.spotify.com/search/1/track.json?q='
        request = requests.get(url + "%20".join(w))
        json_dict = json.loads(request.content)
        num_results = json_dict['info']['num_results']
        if num_results > 0:
            num_removed = len(words) - len(w)
            #track_title = ' '.join(words)
            track_title = "If I Can't Take It with Me"
            for value in json_dict.items():
                if value == track_title:
                    print "match found"
            return num_removed, json_dict

num_words_removed, json_dict = decrementList(split_message)
In the code above, I am trying to match the name of a song to my search query. In this particular query the song will not match, but I have added a variable that will match a song in the returned results. The for loop at the end of the function is supposed to match the track title variable, but I can't figure out why it isn't working. Is there a simple way to find all values for a known key? In this case, the key is "name".

You have to search for the track title within the list stored under the "tracks" key. So just change your code like this:
for value in json_dict["tracks"]:
    if value["name"] == track_title:
        print "match found"
and it will print
match found
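For the more general question of collecting every value stored under a known key anywhere in a JSON response, a small recursive helper works. This is a minimal sketch (the function name find_key_values is made up for illustration), not part of the Spotify response handling above:
def find_key_values(obj, key):
    """Recursively collect every value stored under `key` in nested dicts/lists."""
    found = []
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key:
                found.append(v)
            found.extend(find_key_values(v, key))
    elif isinstance(obj, list):
        for item in obj:
            found.extend(find_key_values(item, key))
    return found

# e.g. every track name anywhere in the search response:
track_names = find_key_values(json_dict, "name")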

Related

xpath - IndexError: list index out of range after selecting element with path

I get the following IndexError when trying to run the code below (found here: https://github.com/israel-dryer/Twitter-Scraper/blob/main/twitter-scraper-tut.ipynb):
card = cards[0]
IndexError: list index out of range
Since I am a Python newbie, could you help me figure this out, please?
Thanks a lot!!!
cards = driver.find_elements_by_xpath('//div[@data-testid="tweet"]')
card = cards[0]

# function that collects tweets while scrolling; filters out sponsored tweets; saves tweets in a tuple
def get_tweet_data(card):
    # get username of the tweet
    username_tweet = card.find_element_by_xpath('.//span').text
    # get Twitter handle
    handle_tweet = card.find_element_by_xpath('.//span[contains(text(), "@")]').text
    # get date of post - if no date, then sponsored tweet - then do not return
    try:
        date_tweet = card.find_element_by_xpath('.//time').get_attribute('datetime')
    except NoSuchElementException:
        return
    # get text of tweet
    comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text
    responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text
    text_tweet = comment + responding
    # number of replies, retweets, likes
    reply_count = card.find_element_by_xpath('.//div[@data-testid="reply"]').text
    retweet_count = card.find_element_by_xpath('.//div[@data-testid="retweet"]').text
    like_count = card.find_element_by_xpath('.//div[@data-testid="like"]').text
    tweet = (username_tweet, handle_tweet, date_tweet, text_tweet, reply_count, retweet_count, like_count)
    return tweet

get_tweet_data(card)
It means that driver.find_elements_by_xpath('//div[@data-testid="tweet"]') did not return anything, i.e. there are no elements that match that XPath.
find_elements_by_xpath() returns a list of the found elements (an empty list if nothing matches), so in this case you need to check the length of cards before indexing into it. E.g.:
cards = driver.find_elements_by_xpath('//div[@data-testid="tweet"]')
if len(cards) > 0:
    card = cards[0]
else:
    raise NoSuchElementException('No cards were found')
It might be better to use driver.find_element_by_xpath() instead of driver.find_elements_by_xpath(); it raises an exception if no element is found.
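Because the tweet cards are rendered by JavaScript, the list is often empty simply because the page has not finished loading when the lookup runs. As a hedged alternative (this uses Selenium's standard explicit-wait helpers and is not code from the linked notebook), you can wait for the cards before indexing:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# wait up to 10 seconds for at least one tweet card to appear,
# then take the first one; a TimeoutException is raised if none show up
cards = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.XPATH, '//div[@data-testid="tweet"]'))
)
card = cards[0]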

Python - print 2nd argument

I am new to Python and I've written this test code for practice, in order to find and print email addresses from various web pages:
import re
import urllib2

def FindEmails(*urls):
    for i in urls:
        totalemails = []
        req = urllib2.Request(i)
        aResp = urllib2.urlopen(req)
        webpage = aResp.read()
        patt1 = '(\w+[-\w]\w+@\w+[.]\w+[.\w+]\w+)'
        patt2 = '(\w+[\w]\w+@\w+[.]\w+)'
        regexlist = [patt1, patt2]
        for regex in regexlist:
            match = re.search(regex, webpage)
            if match:
                totalemails.append(match.group())
                break
        #return totalemails
        print "Mails from webpages are: %s " % totalemails

if __name__ == "__main__":
    FindEmails('https://www.urltest1.com', 'https://www.urltest2.com')
When I run it, it prints results for only one of the arguments.
My goal is to print the emails acquired from all the web pages and store them in a list, separated by commas.
Thanks in advance.
The problem here is the line totalemails = []. You are re-initialising totalemails to an empty list on every iteration, so it never holds more than the entry found for the current URL, and after the last iteration you are left with just the last entry. To collect all the emails, move the variable outside of the for loop.
Example:
def FindEmails(*urls):
    totalemails = []
    for i in urls:
        req = urllib2.Request(i)
        ....
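Continuing that idea, here is a minimal sketch of the completed function (same Python 2 style as the question, and the same hypothetical test URLs; re.findall is used so each page can contribute more than one address, which is a small change from the original re.search logic):
import re
import urllib2

def FindEmails(*urls):
    totalemails = []                       # accumulate results across all URLs
    pattern = r'\w+[-.\w]*@\w+[.]\w+[.\w]*'
    for i in urls:
        webpage = urllib2.urlopen(urllib2.Request(i)).read()
        totalemails.extend(re.findall(pattern, webpage))
    # one print at the end, entries separated by commas
    print "Mails from webpages are: %s" % ", ".join(totalemails)
    return totalemails

if __name__ == "__main__":
    FindEmails('https://www.urltest1.com', 'https://www.urltest2.com')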

Cannot download Wordnet Error

I am trying to compile this code:
from collections import OrderedDict
import pdb

pdb.set_trace()

def alphaDict(words):
    alphas = OrderedDict()
    words = sorted(words, key=str.lower)
    words = filter(None, words)
    for word in words:
        if word[0].upper() not in alphas:
            alphas[word[0].upper()] = []
        alphas[word[0].upper()].append(word.lower())
    return alphas

def listConvert(passage):
    alphabets = " abcdefghijklmnopqrstuvwxyz"
    for char in passage:
        if char.lower() not in alphabets:
            passage = passage.replace(char, "")
    listConvert(passage)
    passage = rDup(passage.split(" "))
    return passage

def rDup(sequence):
    unique = []
    [unique.append(item) for item in sequence if item not in unique]
    return unique

def otherPrint(word):
    base = "http://dictionary.reference.com/browse/"
    end = "?s=t"
    from nltk.corpus import wordnet as wn
    data = [s.definition() for s in wn.synsets(word)]
    print("<li>")
    print("<a href = '" + base + word + end + "' target = '_blank'><h2 class = 'dictlink'>" + (word.lower()) + ":</h2></a>")
    if not data:
        print("Sorry, we could not find this word in our data banks. Please click the word to check <a target = '_blank' class = 'dictlink' href = 'http://www.dictionary.com'>Dictionary.com</a>")
        return
    print("<ul>")
    for key in data:
    print("<li>" + key + "</li>")
    print("</ul>")
    print("</ol>")
    print("</li>")

def formatPrint(word):
    base = "http://dictionary.reference.com/browse/"
    end = "?s=t"
    from PyDictionary import PyDictionary
    pd = PyDictionary()
    data = pd.meaning(word)
    print "<li>"
    print "<a href = '" + base + word + end + "' target = '_blank'><h2 class = 'dictlink'>" + (word.lower()) + ":</h2></a>"
    if not data:
        print "Sorry, we could not find this word in our data banks. Please click the word to check <a target = '_blank' class = 'dictlink' href = 'http://www.dictionary.com'>Dictionary.com</a>"
        return
    print "<ol type = 'A'>"
    for key in data:
        print "<li><h3 style = 'color: red;' id = '" + word.lower() + "'>" + key + "</h3><ul type = 'square'>"
        for item in data[key]:
            print "<li>" + item + "</li>"
        print "</ul>"
        print "</li>"
    print "</ol>"
    print "</li>"

def specPrint(words):
    print "<ol>"
    for word in words:
        otherPrint(word)
    print "</ol>"
    print "<br/>"
    print "<br/>"
    print "<a href = '#list'> Click here</a> to go back to choose another letter<br/>"
    print "<a href = '#sentence'>Click here</a> to view your sentence.<br/>"
    print "<a href = '#header'>Click here</a> to see the owner's information.<br/>"
    print "<a href = '../html/main.html'>Click here</a> to go back to the main page."
    print "</div>"
    for x in range(0, 10):
        print "<br/>"
To all those who answered my previous question, thank you; it worked, and I will be accepting an answer soon. However, I have another problem. When I try to import wordnet in a shell (by running the script and via IDLE commands), it works fine. However, on XAMPP, I get this error:
Can someone please explain this as well? Thanks!
The body of your for loop in otherPrint is not indented -
for key in data:
print("<li>"+key+"</li>")
print("</ul>")
print("</ol>")
print("</li>")
This is most probably the issue. Try indenting it-
for key in data:
    print("<li>"+key+"</li>")
print("</ul>")
print("</ol>")
print("</li>")
Also, please understand that Python treats tabs and spaces differently: if you indent one line using a tab and the next line using four manually typed spaces, it will cause an indentation error. You have to use either all spaces or all tabs; you cannot mix the two (even though they look the same).
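To illustrate that last point, here is a minimal, hypothetical snippet (assuming Python 3, where the mismatch is rejected outright): compiling source that mixes a tab and spaces at the same indentation level fails with TabError.
# two body lines at the same level: one indented with four spaces, one with a tab
src = "def greet():\n    print('hello')\n\tprint('world')\n"

try:
    compile(src, "<example>", "exec")
except TabError as err:
    print(err)  # inconsistent use of tabs and spaces in indentation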
A couple of things. First is the indentation of the first line; that may just be an artifact of copying the code here.
Then every time you have a colon, you need to have the next line indented. So in the otherPrint function you have this:
for key in data:
print("<li>"+key+"</li>")
print("</ul>")
print("</ol>")
print("</li>")
At least the first line needs to be indented. If you intend all of the prints to be in the loop, then you need to indent all of them.
You also have the same issue with your if statements in the formatPrint function. Try indenting them under the loops and conditionals and this should clear it up. If you are still finding a problem, check that you have the correct number of parentheses and brackets closing out statements. Leaving one off will cause the rest of the code to go wonky.
Also, you are using print statements instead of the print() function. The print statement no longer works in Python 3.x; you have to enclose everything you print in parentheses.
def formatPrint(word):
    base = "http://dictionary.reference.com/browse/"
    end = "?s=t"
    from PyDictionary import PyDictionary
    pd = PyDictionary()
    data = pd.meaning(word)
    print("<li>")
    print(
        "<a href = '" + base + word + end + "' target = '_blank'>"
        "<h2 class = 'dictlink'>" + (word.lower()) + ":</h2></a>"
    )
    if not data:
        print(
            "Sorry, we could not find this word in our data banks. "
            "Please click the word to check <a target = '_blank' "
            "class = 'dictlink' href "
            "= 'http://www.dictionary.com'>Dictionary.com</a>"
        )
        return
    print("<ol type = 'A'>")
    for key in data:
        print(
            "<li><h3 style = 'color: red;' id = '" + word.lower() +
            "'>" + key + "</h3><ul type = 'square'>"
        )
        for item in data[key]:
            print("<li>" + item + "</li>")
        print("</ul>")
        print("</li>")
    print("</ol>")
    print("</li>")

Parsing Status JSON from Twitter using Python to find statuses count

I am using Tweepy to get all tweets made by @UserName. This is the code:
import urllib, json
import sys
import tweepy
from tweepy import OAuthHandler

def twitter_fetch(screen_name="prateek", maxnumtweets=10):
    consumer_token = ""   # keys removed for security
    consumer_secret = ""
    access_token = ""
    access_secret = ""
    auth = tweepy.OAuthHandler(consumer_token, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    api = tweepy.API(auth)
    for status in tweepy.Cursor(api.user_timeline, id=screen_name).items(1):
        print status['statuses_count']
        print '\n'

if __name__ == '__main__':
    twitter_fetch('BarackObama', 200)
How do I parse the JSON properly to read the Number of statuses made by that particular user ?
How about something that keeps track of how many statuses you've iterated through? I'm not positive how tweepy works, but using something like this:
statuses = 0
for status in tweepy.Cursor(api.user_timeline, id=screen_name).items(1):
    print status['statuses_count']
    statuses += 1
    print '\n'
return statuses
Usually JSON data has a nice structure, with clear formatting, which makes it easier to understand.
So when I want to iterate through such a list to find whether an x exists (an achievement, in this case), I use a function like this, which adds 1 to index on every iteration it goes through:
def achnamefdr(appid, mykey, steamid64, achname):
    playerachurl = 'http://api.steampowered.com/ISteamUserStats/GetPlayerAchievements/v0001/?appid=' + str(appid) + '&key=' + mykey + '&steamid=' + steamid64 + '&l=name'
    achjson = json.loads(urllib.request.urlopen(playerachurl).read().decode('utf-8'))
    achjsonr = achjson['playerstats']['achievements']
    index = 0
    for ach in achjsonr:
        if not ach['name'].lower() == achname.lower():
            index += 1
            continue
        else:
            achnamef = ach['name']
            return achnamef, index, True
    return 'Invalid Achievement!', index, False
It can be done by getting the JSON object from status._json and then reading the field directly:
print status._json["statuses_count"]
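If the goal is just the total number of statuses for a user (rather than iterating the timeline), a hedged alternative, assuming an authenticated tweepy.API object on an older (pre-4.0) Tweepy release like the one used in the question, is to read it straight from the user object:
user = api.get_user(screen_name='BarackObama')
print user.statuses_count            # attribute access on the User model
print user._json['statuses_count']   # or via the raw JSON dict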

Very fast webpage scraping (Python)

So I'm trying to filter through a list of URLs (potentially in the hundreds) and filter out every article whose body is less than X number of words (ARTICLE_LENGTH). But when I run my application, it takes an unreasonable amount of time, so much so that my hosting service times out. I'm currently using Goose (https://github.com/grangier/python-goose) with the following filter function:
def is_news_and_data(url):
    """A function that returns a list of the form
    [True, title, meta_description]
    or
    [False]
    """
    result = []
    if url == None:
        return False
    try:
        article = g.extract(url=url)
        if len(article.cleaned_text.split()) < ARTICLE_LENGTH:
            result.append(False)
        else:
            title = article.title
            meta_description = article.meta_description
            result.extend([True, title, meta_description])
    except:
        result.append(False)
    return result
It is called in the context of the following. Don't mind the debug prints and messiness (tweepy is my Twitter API wrapper):
def get_links(auth):
    """Returns a list of t.co links from a list of given tweets"""
    api = tweepy.API(auth)
    page_list = []
    tweets_list = []
    links_list = []
    news_list = []
    regex = re.compile('http://t.co/.[a-zA-Z0-9]*')
    for page in tweepy.Cursor(api.home_timeline, count=20).pages(1):
        page_list.append(page)
    for page in page_list:
        for status in page:
            tweet = status.text.encode('utf-8', 'ignore')
            tweets_list.append(tweet)
    for tweet in tweets_list:
        links = regex.findall(tweet)
        links_list.extend(links)
    #print 'The length of the links list is: ' + str(len(links_list))
    for link in links_list:
        news_and_data = is_news_and_data(link)
        if True in news_and_data:
            news_and_data.append(link)
            #[True, title, meta_description, link]
            news_list.append(news_and_data[1:])
    print 'The length of the news list is: ' + str(len(news_list))
Can anyone recommend a faster method?
This code is probably causing your slow performance:
len(article.cleaned_text.split())
This is performing a lot of work, most of which is discarded. I would profile your code to see if this is the culprit; if so, replace it with something that just counts spaces, like so:
article.cleaned_text.count(' ')
That won't give you exactly the same result as your original code, but will be very close. To get closer you could use a regular expression to count words, but it won't be quite as fast.
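To make that trade-off concrete, here is a minimal, standalone sketch (using a made-up sample string rather than a Goose article) comparing the exact word count with the cheaper space-count approximation:
text = "one two  three four"        # note the double space

exact = len(text.split())           # 4 - split() collapses runs of whitespace
approx = text.count(' ') + 1        # 5 - counting spaces overshoots on repeated spaces

print exact, approx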
I'm not saying this is the absolute best you can do, but it will be faster. You'll have to redo some of your code to fit this new function; it will at least give you fewer function calls.
You'll have to pass it the whole URL list.
def is_news_in_data(listings):
    new_listings = {}
    tmp_listing = ''
    is_news = {}
    for i in listings:
        url = listings[i]
        is_news[url] = 0
        article = g.extract(url=url).cleaned_text
        tmp_listing = ''
        for s in article:
            is_news[url] += 1
            tmp_listing += s
        if is_news[url] > ARTICLE_LENGTH:
            new_listings[url] = tmp_listing
            del is_news[url]
    return new_listings
