urlopen for loop with beautifulsoup - python

New user here. I'm starting to get the hang of Python syntax but keep getting thrown off by for loops. I understand each scenario I've read on SO thus far (and my previous examples), but can't seem to come up with one for my current scenario.
I am playing around with BeautifulSoup to extract features from app stores as an exercise.
I created a list of both GooglePlay and iTunes urls to play around with.
list = {"https://play.google.com/store/apps/details?id=com.tov.google.ben10Xenodromeplus&hl=en",
"https://play.google.com/store/apps/details?id=com.doraemon.doraemonRepairShopSeasons&hl=en",
"https://play.google.com/store/apps/details?id=com.KnowledgeAdventure.SchoolOfDragons&hl=en",
"https://play.google.com/store/apps/details?id=com.turner.stevenrpg&hl=en",
"https://play.google.com/store/apps/details?id=com.indigokids.mimdoctor&hl=en",
"https://play.google.com/store/apps/details?id=com.rovio.gold&hl=en",
"https://itunes.apple.com/us/app/angry-birds/id343200656?mt=8",
"https://itunes.apple.com/us/app/doodle-jump/id307727765?mt=8",
"https://itunes.apple.com/us/app/tiny-wings/id417817520?mt=8",
"https://itunes.apple.com/us/app/flick-home-run-!/id454086751?mt=8",
"https://itunes.apple.com/us/app/bike-race-pro/id510461370?mt=8"}
To test out beautifulsoup (bs in my code), I used one app for each store:
gptest = bs(urllib.urlopen("https://play.google.com/store/apps/details?id=com.rovio.gold&hl=en"))
ios = bs(urllib.urlopen("https://itunes.apple.com/us/app/doodle-jump/id307727765?mt=8"))
I found an app's category on iTunes using:
print ios.find(itemprop="applicationCategory").get_text()
...and on Google Play:
print gptest.find(itemprop="genre").get_text()
With this newfound confidence, I wanted to try to iterate through my entire list and output these values, but then I realized I suck at for loops...
Here's my attempt:
def opensite():
    for item in list:
        bs(urllib.urlopen())

for item in list:
    try:
        if "itunes.apple.com" in row:
            print "Category:", opensite.find(itemprop="applicationCategory").get_text()
        else if "play.google.com" in row:
            print "Category", opensite.find(itemprop="genre").get_text()
    except:
        pass
Note: Ideally I'd be passing a csv (called "sample" with one column "URL") so I believe my loop would start with
for row in sample.URL:
but I figured it was more helpful to show you a list rather than deal with a data frame.
Thanks in advance!

from __future__ import print_function
try:
    from urllib import urlopen           # Python 2
except ImportError:
    from urllib.request import urlopen   # Python 3
from bs4 import BeautifulSoup as bs

for line in open('urls.dat'):                    # read urls from the file line by line
    doc = bs(urlopen(line.strip()), 'html5lib')  # strip the \n from the url, open it and parse
    if 'apple.com' in line:
        prop = 'applicationCategory'
    elif 'google.com' in line:
        prop = 'genre'
    else:
        continue
    print(doc.find(itemprop=prop).get_text())
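Since the question mentions reading the URLs out of a csv called "sample" with a single URL column, here is a minimal pandas variant of the same loop. It is only a sketch: the file name sample.csv and the html5lib parser are assumptions.
import pandas as pd                     # assumption: pandas is installed
from bs4 import BeautifulSoup as bs
try:
    from urllib import urlopen          # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

sample = pd.read_csv('sample.csv')      # assumption: one column named URL
for url in sample.URL:
    doc = bs(urlopen(url), 'html5lib')
    prop = 'applicationCategory' if 'itunes.apple.com' in url else 'genre'
    tag = doc.find(itemprop=prop)
    if tag is not None:                 # some pages may lack the markup
        print('{} Category: {}'.format(url, tag.get_text()))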

Try this for reading the urls from a list:
from bs4 import BeautifulSoup as bs
import requests
list = {"https://play.google.com/store/apps/details?id=com.tov.google.ben10Xenodromeplus&hl=en",
"https://play.google.com/store/apps/details?id=com.doraemon.doraemonRepairShopSeasons&hl=en",
"https://play.google.com/store/apps/details?id=com.KnowledgeAdventure.SchoolOfDragons&hl=en",
"https://play.google.com/store/apps/details?id=com.turner.stevenrpg&hl=en",
"https://play.google.com/store/apps/details?id=com.indigokids.mimdoctor&hl=en",
"https://play.google.com/store/apps/details?id=com.rovio.gold&hl=en",
"https://itunes.apple.com/us/app/angry-birds/id343200656?mt=8",
"https://itunes.apple.com/us/app/doodle-jump/id307727765?mt=8",
"https://itunes.apple.com/us/app/tiny-wings/id417817520?mt=8",
"https://itunes.apple.com/us/app/flick-home-run-!/id454086751?mt=8",
"https://itunes.apple.com/us/app/bike-race-pro/id510461370?mt=8"}
def opensite():
    for item in list:
        source = requests.get(item)
        text_new = source.text
        soup = bs(text_new, "html.parser")
        try:
            if "itunes.apple.com" in item:
                print item, "Category:", soup.find('span', {'itemprop': 'applicationCategory'}).text
            elif "play.google.com" in item:
                print item, "Category:", soup.find('span', {'itemprop': 'genre'}).text
        except AttributeError:  # find() returned None: no category tag on the page
            pass
opensite()
It will print something like:
https://itunes.apple.com/us/app/doodle-jump/id307727765?mt=8 Category: Games
https://play.google.com/store/apps/details?id=com.KnowledgeAdventure.SchoolOfDragons&hl=en Category: Role Playing
https://play.google.com/store/apps/details?id=com.tov.google.ben10Xenodromeplus&hl=en Category: Role Playing
https://itunes.apple.com/us/app/tiny-wings/id417817520?mt=8 Category: Games
https://play.google.com/store/apps/details?id=com.doraemon.doraemonRepairShopSeasons&hl=en Category: Role Playing
https://itunes.apple.com/us/app/angry-birds/id343200656?mt=8 Category: Games
https://play.google.com/store/apps/details?id=com.indigokids.mimdoctor&hl=en Category: Role Playing
https://itunes.apple.com/us/app/bike-race-pro/id510461370?mt=8 Category: Games
https://play.google.com/store/apps/details?id=com.rovio.gold&hl=en Category: Role Playing
https://play.google.com/store/apps/details?id=com.turner.stevenrpg&hl=en Category: Role Playing
https://itunes.apple.com/us/app/flick-home-run-!/id454086751?mt=8 Category: Games

Related

Web scraping for loop stuck on first item on list - python

I'm trying to loop through a list of names; it seems it should be very simple, but it's not working. I'm only receiving the first name on the list and nothing more! Why is it not moving on to the next name!?
Here's my code:
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get('http://www.aus.edu/info/200170/college_of_architecture_art_and_design/269/department_of_architecture/4')
soup = BeautifulSoup(page.content, "html.parser")
content = soup.find(class_="supContact")
content_items = content.find_all(class_="contactToggle selected")
names = content_items[0]
s_name = []
for item in name:
    s_name.append(content.find(class_="contactToggle selected").text)
    if not names:
        continue
print(s_name)
#print(names.prettify())
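The likely culprit here: content.find(...) always returns the first match, and names is just the first element of content_items, so the loop never advances through the other matches. A minimal sketch of a fix, iterating over everything find_all already returned (the class names are taken from the question and assumed accurate at the time):
import requests
from bs4 import BeautifulSoup

page = requests.get('http://www.aus.edu/info/200170/college_of_architecture_art_and_design/269/department_of_architecture/4')
soup = BeautifulSoup(page.content, "html.parser")
content = soup.find(class_="supContact")

# find_all() returns every matching element; find() only ever gives the first
s_name = [item.text for item in content.find_all(class_="contactToggle selected")]
print(s_name)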

Web crawler does not open all links in a page

I'm trying to build a web crawler using beautifulsoup and urllib. The crawler is working, but it does not open all the pages on a site. It opens the first link and goes to that link, opens the first link of that page, and so on.
Here's my code:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urljoin
import json, sys

sys.setrecursionlimit(10000)
url = input('enter url ')
d = {}
d_2 = {}
l = []
url_base = url
count = 0

def f(url):
    global count
    global url_base
    if count <= 100:
        print("count: " + str(count))
        print('now looking into: ' + url + '\n')
        count += 1
        l.append(url)
        html = urlopen(url).read()
        soup = BeautifulSoup(html, "html.parser")
        d[count] = soup
        tags = soup('a')
        for tag in tags:
            meow = tag.get('href', None)
            if urljoin(url, meow) in l:
                print("Skipping this one: " + urljoin(url, meow))
            elif "mailto" in urljoin(url, meow):
                print("Skipping this one with a mailer")
            elif meow is None:
                print("skipping 'None'")
            elif not meow.startswith('http'):
                f(urljoin(url, meow))
            else:
                f(meow)
    else:
        return

f(url)
print('\n\n\n\n\n')
print('Scraping Completed')
print('\n\n\n\n\n')
You're seeing this behavior because of where the code recursively calls your function: as soon as it finds a valid link, f is called again, and the rest of the for loop doesn't run until that call returns.
What you're doing is a depth first search, but the internet is very deep. You want to do a breadth first search instead.
Probably the easiest way to modify your code to do that is to keep a global list of links to follow: have the for loop append all the scraped links to the end of this list, and then, outside of the for loop, remove the first element of the list and follow that link (a minimal sketch follows below).
You may have to change your logic slightly for your max count.
If count reaches 100, no further links will be opened. Therefore I think you should decrease count by one after leaving the for loop. If you do this, count would be something like the current link depth (and 100 would be the maximum link depth).
If the variable count should refer to the number of opened links, then you might want to control the link depth in another way.
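Here is a minimal sketch of that breadth-first rewrite, using collections.deque as the queue. The skip rules are carried over from the question; the 100-page cap is kept as a simple visit count rather than a depth limit.
from collections import deque
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup

start_url = input('enter url ')
queue = deque([start_url])   # FIFO: the oldest discovered link is visited next
seen = {start_url}           # every link ever enqueued, so none repeats
count = 0

while queue and count < 100:
    url = queue.popleft()    # breadth first: take from the front of the queue
    count += 1
    print('now looking into: ' + url + '\n')
    soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
    for tag in soup('a'):
        href = tag.get('href')
        if href is None or 'mailto' in href:
            continue         # same skip rules as the original crawler
        full = urljoin(url, href)
        if full not in seen:
            seen.add(full)
            queue.append(full)  # enqueue; it will be visited after its siblings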

For loops with user input on Python

Hello, I'm learning how to parse HTML with BeautifulSoup. I would like to know if it is possible to use user input in a for loop, as:
for (user input) in A
A is a list of links, so the user can choose which link to go to by using an input.
And then I use urllib to open that link and repeat the process.
You can use something like this:
import urllib2
from bs4 import BeautifulSoup

choice = ''
for url in urls:
    print('Go to {}?'.format(url))
    decision = raw_input('Y/n ')  # raw_input, since urllib2 implies Python 2
    if decision == 'Y':
        choice = url
        break

if choice:
    r = urllib2.urlopen(choice).read()
    soup = BeautifulSoup(r, 'lxml')
    # do something else
It wasn't exactly clear to me if you really wanted to "open" the link in a browser, so I included some code to do that. Is this maybe what you wanted from "digit a position"?
tl;dr
print("Which URL would you like to open?"
      " (Please select an option between 1-{})".format(len(A)))
for index, link in enumerate(A):
    print index+1, link
Full:
from bs4 import BeautifulSoup
import requests
import webbrowser

A = [
    'https://www.google.com',
    'https://www.stackoverflow.com',
    'https://www.xkcd.com',
]

print("Which URL would you like to open?"
      " (Please select an option between 1-{})".format(len(A)))
for index, link in enumerate(A):
    print index+1, link

_input = input()
try:
    option_index = int(_input) - 1
except ValueError:
    print "{} is not a valid choice.".format(_input)
    raise

try:
    selection = A[option_index]
except IndexError:
    print "{} is not a valid choice.".format(_input)
    raise

webbrowser.open(selection)
response = requests.get(selection)
html_string = response.content
# Do parsing...
Thanks for your help. I arrived at a solution for this.
I created two variables: count = input() and position = input().
I used the count in a for loop: for _ in range(count). With this I can make a process repeat the number of times the user wants (for this assignment it's 4).
I use the position (predefined as 3 for this assignment) as an index into a list of all the URLs. So to open the url at position 3 I have:
url = links[position - 1] (the -1 is because the user inputs 3, but list indices start at 0: 0, 1, 2, ...)
And then I can use urllib.request.urlopen(url).read().
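Put together, that approach might look like the sketch below; the links list and the prompt texts are placeholders, not part of the assignment.
from urllib.request import urlopen

# placeholder list of URLs -- an assumption for the sketch
links = ['https://www.example.com/a',
         'https://www.example.com/b',
         'https://www.example.com/c']

count = int(input('How many repetitions? '))     # the assignment uses 4
position = int(input('Which link (1-based)? '))  # the assignment uses 3
url = links[position - 1]   # the user counts from 1, list indices start at 0

for _ in range(count):
    html = urlopen(url).read()
    # ... parse html here ...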

BeautifulSoup returning unrelated HTML

I'm trying to parse basketball stat data from pages like http://www.sports-reference.com/cbb/boxscores/2014-11-14-kentucky.html. I'm using Python 2.7.6 and BeautifulSoup 4-4.3.2. I'm searching gamelogs like the above page for the class "sortable" in order to get access to the raw stat data contained within the tables. I am only interested in the "Basic Stats" for each team.
However, the HTML that BeautifulSoup is returning is not at all what I expect. Instead I get a list of all-time team records and data for every school that has ever played. I don't have enough reputation to post a second link to the output here, or I would.
Basically, there are four class "sortable" tables on the boxscore page. When I ask BS to find them by the only way I can think of to distinguish them from the other data, it instead returns completely irrelevant data and I can't even figure out where the returned data comes from.
Here's the code:
import urllib2
import re
import sys
from bs4 import BeautifulSoup

class Gamelogs():
    def __init__(self):
        # the base page that has all boxscore links
        self.teamPageSoup = BeautifulSoup(urllib2.urlopen(
            'http://www.sports-reference.com/cbb/schools/' + school +
            '/2015-gamelogs.html'))
        # use regex to only find links with score data
        self.statusPageLinks = self.teamPageSoup.findAll(href=re.compile(
            "boxscores"))

def scoredata(links, school):
    # for each link in the school's season
    for l in links:
        gameSoup = BeautifulSoup(urllib2.urlopen(l))
        # remove extra link formatting to get just the filename alone
        l = l[59 + len(school):]
        # open a local file with that filename to store the results
        fo = open(str(l), "w")
        # create a list that will hold the box score data only
        output = gameSoup.findAll(class_="sortable")
        # write it line by line to the file that was just opened
        for o in output:
            fo.write(str(o) + '\n')
        fo.close()

def getlinks(school):
    gamelogs = Gamelogs()
    # open a new file to store the output
    fo = open(school + '.txt', "w")
    # remove extraneous links
    gamelogs.statusPageLinks = gamelogs.statusPageLinks[2:]
    # create the list that will hold each school's season-long boxscores
    boxlinks = list()
    for s in gamelogs.statusPageLinks:
        # make the list element a string so it can be sliced
        string = str(s)
        # remove extra link formatting
        string = string[9:]
        string = string[:-16]
        # create the full list of games per school
        boxlinks.insert(0, 'http://www.sports-reference.com/cbb/schools/'
                        + school + string)
    scoredata(boxlinks, school)

if __name__ == '__main__':
    # for each school as a commandline argument
    for arg in sys.argv[1:]:
        school = arg
        getlinks(school)
Is this a problem with BS, my code, or the site?
It looks like this is an issue with your code. The page that you are getting back sounds like this one: http://www.sports-reference.com/cbb/schools/?redir
Whenever I enter an invalid school name I am redirected to a page showing stats for 477 different teams. FYI: team names in the url are also case sensitive.
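A quick way to confirm that diagnosis is to check the final URL after any redirect. This is only a sketch, with 'kentucky' as an example slug:
import urllib2

school = 'kentucky'   # the site's school slugs are case sensitive
resp = urllib2.urlopen('http://www.sports-reference.com/cbb/schools/'
                       + school + '/2015-gamelogs.html')
# geturl() reports the URL actually served; an invalid school name lands
# on the all-schools index instead of the gamelog page
if '2015-gamelogs' not in resp.geturl():
    print 'redirected -- invalid school name: ' + school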

Finding information on a website without an external module

I am creating a program in Python where you search up a tv show/movie, and from IMDb, it gives you:
The title, year, rating, age rating, and synopsis of the movie.
I want to use no external modules at all, only the ones that come with Python 3.4.
I know I will have to use urllib, but I do not know where to go from there.
How would I do this?
This is an example taken from here:
import json
from urllib.parse import quote
from urllib.request import urlopen

def search(title):
    API_URL = "http://www.omdbapi.com/?r=json&s=%s"
    title = title.encode("utf-8")
    url = API_URL % quote(title)
    data = urlopen(url).read().decode("utf-8")
    data = json.loads(data)
    if data.get("Response") == "False":
        print(data.get("Error", "Unknown error"))
    return data.get("Search", [])
Then you can do:
>>> search("Idiocracy")
[{'Year': '2006', 'imdbID': 'tt0387808', 'Title': 'Idiocracy'}]
It's maybe too complex, but: I look at the webpage's source code, find where the info I want is, and then extract it.
import urllib.request

def search(title):
    html = urllib.request.urlopen("http://www.imdb.com/find?q=" + title).read().decode("utf-8")
    # pull the href of the first search result out of the raw HTML
    f = html.find("<td class=\"result_text\"> <a href=\"", 0) + 34
    openlink = ""
    while html[f] != "\"":
        openlink += html[f]
        f += 1
    html = urllib.request.urlopen("http://www.imdb.com" + openlink).read().decode("utf-8")
    # title and year from the og:title meta tag
    f = html.find("<meta property='og:title' content=\"", 0) + 35
    titleyear = ""
    while html[f] != "\"":
        titleyear += html[f]
        f += 1
    # rating from the "Users rated this x/10" tooltip
    f = html.find("title=\"Users rated this ", 0) + 24
    rating = ""
    while html[f] != "/":
        rating += html[f]
        f += 1
    # synopsis from the description meta tag
    f = html.find("<meta name=\"description\" content=\"", 0) + 34
    shortdescription = ""
    while html[f] != "\"":
        shortdescription += html[f]
        f += 1
    print(titleyear, rating, shortdescription)
    return (titleyear, rating, shortdescription)

search("friends")
The number added to f has to be just right: you count the length of the string you are searching for, because find() returns the position of the first character of that string.
It looks bad; is there any simpler way to do it?
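For what it's worth, the standard library's html.parser can replace the manual index arithmetic while still honouring the no-external-modules constraint. A sketch (the IMDb URL and meta tag names are assumptions based on the markup the answer above scrapes):
from html.parser import HTMLParser
from urllib.request import urlopen

class MetaGrabber(HTMLParser):
    # collects the content of every <meta property=...> / <meta name=...> tag
    def __init__(self):
        super().__init__()
        self.meta = {}

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            d = dict(attrs)
            key = d.get('property') or d.get('name')
            if key and 'content' in d:
                self.meta[key] = d['content']

html = urlopen('http://www.imdb.com/title/tt0387808/').read().decode('utf-8')
parser = MetaGrabber()
parser.feed(html)
print(parser.meta.get('og:title'), parser.meta.get('description'))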
