I'm very new to Python so this is probably straightforward, and might be an indentation issue. I'm trying to scrape over several webpages using beautiful soup, creating a list of dictionaries that I can use afterwards to manipulate the data.
The code seems to work fine, but the list I end up with (liste_flat) is just a list of the same two dictionaries. I want a list of different dictionaries.
def scrap_post(url):
url = "https://www.findproperly.co.uk/property-to-rent-london/commute/W3siaWQiOjkxMDYsImZyZXEiOjUsIm1ldGgiOiJwdWJ0cmFucyIsImxuZyI6LTAuMTI0Nzg5LCJsYXQiOjUxLjUwODR9XQ==/max-time/90/page/".format(i)
dictionary = {}
response = requests.get(url)
soup = bs(response.text,"lxml")
taille = len(soup.find_all("div", class_="col-sm-6 col-md-4 col-lg-3 pl-grid-prop not-viewed ")) #48 entries
for num_ville in range(0,taille):
print(num_ville)
apt_id = soup.find_all("div", class_="col-sm-6 col-md-4 col-lg-3 pl-grid-prop not-viewed ")[num_ville]['data-id']
entry = soup.find_all("div", class_="col-sm-6 col-md-4 col-lg-3 pl-grid-prop not-viewed ")[num_ville]
pricepw = soup.find_all('div', class_='col-xs-5 col-sm-4 price')[num_ville].find('h3').text.encode('utf-8').replace('\xc2\xa3','',).replace('pw','',).strip()
rooms = soup.find_all('div', class_='col-xs-6 type')[num_ville].find('p').text.encode('utf-8').strip()
lat = soup.find_all('div', {"itemprop":"geo"})[num_ville].find('meta', {'itemprop':'latitude'})['content']
lon = soup.find_all('div', {"itemprop":"geo"})[num_ville].find('meta', {'itemprop':'longitude'})['content']
dictionary[num_ville]={'Price per week':pricepw,'Rooms':rooms,'Latitude':lat,'Longitude':lon}
return dictionary
#get all URLs
liste_url = []
liste_url = ['https://www.findproperly.co.uk/property-to-rent-london/commute/W3siaWQiOjkxMDYsImZyZXEiOjUsIm1ldGgiOiJwdWJ0cmFucyIsImxuZyI6LTAuMTI0Nzg5LCJsYXQiOjUxLjUwODR9XQ==/max-time/90/page/''%i' %i for i in range(1,3)]
#get flats
liste_flat = [scrap_post(i) for i in liste_url]
I must somehow be looping over the same website twice. Any advice on how to make sure I'm looping over different websites?
Thanks!
Yes, you are looping over the same website, because you have hardcoded the url variable in your function.
url = "https://www.findproperly.co.uk/property-to-rent-london/commute/W3siaWQiOjkxMDYsImZyZXEiOjUsIm1ldGgiOiJwdWJ0cmFucyIsImxuZyI6LTAuMTI0Nzg5LCJsYXQiOjUxLjUwODR9XQ==/max-time/90/page/".format(i)
Meaning regardless of what you send to the function, it will always use this url. You might want to remove that. You also haven't placed a placeholder in your string and the .format(i) essentially does nothing.
Related
Hello hope you're all doing well I wrote the following program to extract the data from it's source but it gives me the last value and I don't know why
users_list = soup.find_all("li", class_= "ContentGrid-gridItem-2Ad e2e-ContentGriditem")
for user in users_list:
name = user.find("span", class_ = "e2e-UserSummary-displayName")
profile_link = user.find('a', class_='UserSummary-ownerLink-1cJ')
the variable profile_link gives me this output
<a class="UserSummary-ownerLink-1cJ"
href="https://www.behance.net/baianat?tracking_source=search_users"
target="_blank"></a>
i want to make it return only the link
another thing also I built other variables to return values but it keep returning the same values because it's having the same class and they all in h4 here's the code and it's returning the value of the first object which is appreciations 189.9k
appreciations = users.find("h4", class_= "UserSummaryStats-statAmount-13R").text
followers = users.find("h4", class_= "UserSummaryStats-statAmount-13R").text
project_views = users.find("h4", class_= "UserSummaryStats-statAmount-13R").text
To extract only href attribute of the tag you can use tag['<attr>']. For example:
profile_link = user.find('a', class_='UserSummary-ownerLink-1cJ')['href']
For the second case, just use .find_all(<tag_name>, <params>) method which returns all the tags with defined parameters as a list:
user_data = users.find_all("h4", class_= "UserSummaryStats-statAmount-13R")
for entry in user_data:
# do something with each entry
I am new to Python. I am trying to fetch URLs from a page, which has 18 URLs in it, in a DIV with all having the same class name. Below is the code I have used. When I use the below code without a return statement. then it gives all 18 URLs on the page. I have to return these URLs and when I am doing so, it's only returning one URL.
URL = 'https://www.example.com/destinations/'
def make_soup(URL):
response = requests.get(URL)
soup = BeautifulSoup(response.content, 'html.parser')
return soup
def get_new_urls(soup):
for links in soup.find_all("div", class_="col-sm-2 col-md-2 col-lg-2 col-xl-2 col-xs-12 col-6 p0 mb-25"):
dlinks=links.a['href']
return dlinks
make_soup(URL)
get_new_urls(soup)
Pls, help me find solution to this problem. Thanks in advance!
You need to collect the results in another object. First create the list, use .append to add to it, then return the new, populated list.
def get_new_urls(soup):
dlinks = [] # create empty list to collect results
for links in soup.find_all("div", class_="col-sm-2 col-md-2 col-lg-2 col-xl-2 col-xs-12 col-6 p0 mb-25"):
dlinks.append(links.a['href']) # add results to the list
return dlinks # return the full list
I am trying to find a way to loop through URLs and scrape paginated tables in each of them. The issue arises when some URLs have differing page numbers (in some cases there is no table!). Can someone explain to me where I went wrong and how to fix this? (Please let me know if you require further info.)
def get_injuries(pages):
Injuries_list = []
for page in range(1, pages+1):
for player_id in range(1,10):
headers = {"User-Agent":"Mozilla/5.0"}
url = 'https://www.transfermarkt.co.uk/neymar/verletzungen/spieler/' + str(player_id)
print(url)
html = requests.get(url, headers = headers)
soup = bs(html.content)
# Select first table
if soup.select('.responsive-table > .grid-view > .items > tbody'):
soup = soup.select('.responsive-table > .grid-view > .items > tbody')[0]
try:
for cells in soup.find_all(True, {"class": re.compile("^(even|odd)$")}):
Season = cells.find_all('td')[1].text
Strain = cells.find_all('td')[2].text
Injury_From = cells.find_all('td')[3].text
Injury_To = cells.find_all('td')[4].text
Duration_days = cells.find_all('td')[5].text
Games_missed = cells.find_all('td')[6].text
Club_affected = cells.find_all('td')[6].img['alt']
player = {
'name': cells.find_all("h1", {"itemprop": "name"}),
'Season': Season,
'Strain': Strain,
'Injury_from': Injury_From,
'Injury_To': Injury_To,
'Duration (days)': Duration_days,
'Games_Missed': Games_missed,
'Club_Affected': Club_affected
}
players_list.append(player)
except IndexError:
pass
return Injuries_list
return Injuries_list
This should be in the outer most for loop. After looping once it's returning only one url.
Only one for loop is sufficient to get the data.
players_list=[] .I dont' see this, Create one at the start
you are not doing anything with this list Injuries_list. It's returning an empty list
I've scoured the questions/answers and have attempted to implement changes to the following, but to no avail.
I'm trying to scrape pages of course listings from Coursera's "Data Analysis" results, https://www.coursera.org/browse/data-science/data-analysis?languages=en&page=1.
There are 9 pages, each with 25 courses, and each course is under its own <h2> tag. I've found some success with the following code, but it has not been consistent:
courses_data_sci = []
for i in range(10):
page = "https://www.coursera.org/browse/data-science/data-analysis? languages=en&page=" + str(i)
html = urlopen(page)
soup = BeautifulSoup(html.read(), "html.parser")
for meta in soup.find_all('div', {'id' : 'rendered-content'}):
for x in range(26):
try:
course = meta.find_all('h2')[x].text.strip()
courses_data_sci.append(course)
except IndexError:
pass
This code seems to return the first 2-3 pages of results and the last page of results; sometimes, if I run it again after clearning courses_data_sci, it will return the 4th page of results a few times. (I'm working in Jupyter, and I've restarted the kernel to account for any issues there.)
I'm not sure why the code isn't working correctly, let alone why it is returning inconsistent results.
Any help is appreciated. Thank you.
UPDATE
Thanks for the ideas...I am trying to utilize both to make the code work.
Just out of curiosity, I pared down the code to see what it was picking up, with both comments in mind.
courses_data_sci = []
session = requests.Session()
for i in range(10):
page = "https://www.coursera.org/browse/data-science/data-analysis? languages=en&page=" + str(i)
html = urlopen(page)
soup = BeautifulSoup(html.read(), "html.parser")
for meta in soup.find_all('div', {'id' : 'rendered-content'}):
course = meta.find_all('h2')
courses_data_sci.append(course)
# This is to check length of courses_data_sci across pages
print('Page: %s -- total length %s' % (i, len(courses_data_sci)))
This actually results in a list of lists, which does contain all the courses throughout the 9 pages (and, of course, the href info since it isn't being stripped yet). Each loop creates one list per page: a list of all the courses on the respective page. So it appears that I should be able to strip the href while the lists are being pushed to the list, courses_data_sci.
There are 2 <h2> tags per course, so I'm also thinking there could be an issue with the second range() call: for x in range(26). I've tried multiple different ranges, none of which work or which return an error, "index out of range".
I get the same behaviour using your code.
I changed it in order to use requests:
from bs4 import BeautifulSoup
import requests
courses_data_sci = []
session = requests.Session()
for i in range(10):
page = "https://www.coursera.org/browse/data-science/data-analysis?languages=en&page=" + str(i)
html = session.get(page)
soup = BeautifulSoup(html.text, "html.parser")
for meta in soup.find_all('div', {'id' : 'rendered-content'}):
for x in range(26):
try:
course = meta.find_all('h2')[x].text.strip()
courses_data_sci.append(course)
except IndexError:
pass
# This is to check length of courses_data_sci across pages
print('Page: %s -- total length %s' % (i, len(courses_data_sci)))
I need to scrape all 'a' tags with "result-title" class, and all 'span' tags with either class 'results-price' and 'results-hood'. Then, write the output to a .csv file across multiple columns. The current code does not print anything to the csv file. This may be bad syntax but I really can't see what I am missing. Thanks.
f = csv.writer(open(r"C:\Users\Sean\Desktop\Portfolio\Python - Web Scraper\RE Competitor Analysis.csv", "wb"))
def scrape_links(start_url):
for i in range(0, 2500, 120):
source = urllib.request.urlopen(start_url.format(i)).read()
soup = BeautifulSoup(source, 'lxml')
for a in soup.find_all("a", "span", {"class" : ["result-title hdrlnk", "result-price", "result-hood"]}):
f.writerow([a['href']], span['results-title hdrlnk'].getText(), span['results-price'].getText(), span['results-hood'].getText() )
if i < 2500:
sleep(randint(30,120))
print(i)
scrape_links('my_url')
If you want to find multiple tags with one call to find_all, you should pass them in a list. For example:
soup.find_all(["a", "span"])
Without access to the page you are scraping, it's too hard to give you a complete solution, but I recommend extracting one variable at a time and printing it to help you debug. For example:
a = soup.find('a', class_ = 'result-title')
a_link = a['href']
a_text = a.text
spans = soup.find_all('span', class_ = ['results-price', 'result-hood'])
row = [a_link, a_text] + [s.text for s in spans]
print(row) # verify we are getting the results we expect
f.writerow(row)