I wrote this code to get eBay prices.
It asks for a full eBay link and then prints the item's price.
import bs4, requests

print('please enter full Ebay link ..')
link = str(input())

def ebayprice(url):
    res = requests.get(url)
    res.raise_for_status()
    txt = bs4.BeautifulSoup(res.text, 'html.parser')
    csselement = txt.select('#mm-saleDscPrc')
    return csselement[0].text.strip()

price = ebayprice(link)
print('price is : ' + price)
I want to improve it. I tried my best and couldn't get it working.
I want it to take multiple links, run them one by one, and print the result each time.
It doesn't matter if the links come from input() or from a string like links = 'www1,www2,www3'.
You can split on the comma and iterate over the list with a for loop:
def ebayprice(url):
    ...

for single_link in link.split(','):
    price = ebayprice(single_link)
    print('price for {} is {}'.format(single_link, price))
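For completeness, a minimal sketch of the whole flow under that assumption, reading one comma-separated line of links from input() (the prompt text here is just an example):

import bs4, requests

def ebayprice(url):
    res = requests.get(url)
    res.raise_for_status()
    txt = bs4.BeautifulSoup(res.text, 'html.parser')
    csselement = txt.select('#mm-saleDscPrc')
    return csselement[0].text.strip()

print('please enter Ebay links separated by commas ..')
link = input()

for single_link in link.split(','):
    single_link = single_link.strip()  # drop stray spaces around each link
    price = ebayprice(single_link)
    print('price for {} is {}'.format(single_link, price))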
If you want, you can ask how many links the user wants to scrape, and then use a for loop to go through every URL:
import bs4, requests

# ask how many links will be passed
print('How many links do you want to scrape ?')
link_numb = int(input())

# get the links
print('please enter full Ebay link ..')
links = [input() for _ in range(link_numb)]

def ebayprice(link):
    res = requests.get(link)
    res.raise_for_status()
    txt = bs4.BeautifulSoup(res.text, 'html.parser')
    csselement = txt.select('#mm-saleDscPrc')
    return csselement[0].text.strip()

for link in links:
    price = ebayprice(link)
    print(price)
Example:
How many links do you want to scrape ?
2
please enter full Ebay link ..
http://example.com
http://example-just-test.com
# simply prints the url
http://example.com
http://example-just-test.com
I am trying to scrape a web application and I want to print only the href links that have the text "Show on diagram" associated with them.
Here is a screenshot of the HTML I am trying to get to print: [screenshot of the web application code]
Here is the Python code I am using:
import webbrowser
from bs4 import BeautifulSoup
import requests

def cllicheck(clli):  # checks to see if there are exactly 8 characters in the clli input
    #print "in the clli checker"
    if len(clli) == 8:
        return True
    else:
        print('Invalid CLLI')
        return

def drawing(CLLI):
    if cllicheck(CLLI) == True:
        CLLIstate = CLLI[4:6]
        #print "------- Netviz", NetvizCLLI, "was entered ----------"
        resultStr = 'https://gpn.prod.att.com/netTerrain/Search?q=' + CLLI + '&subDiagrams=False&exactMatch=False&filter=Title'
        webbrowser.open(resultStr)
        r = requests.get(resultStr)
        soup = BeautifulSoup(r.content, 'html.parser')
        for link in soup.find_all('a', 'Show on diagram', href=True):
            print(link['href'])
        main()
    else:
        print('Please Enter 8 Characters')
        main()

def main():
    # Beginning function to prompt user for CLLI
    print('\nPlease Enter CLLI.\n')
    CLLI = input()
    drawing(CLLI)

main()  # This begins the GPN Program
When I run this, it prints nothing out.
This is the portion of the code I am using in hopes of returning the href link:

for link in soup.find_all('a', 'Show on diagram', href=True):
    print(link['href'])

I am expecting the link shown in the screenshot to be displayed/printed when I run this code.
I wrote this as an answer because the solution is already in the comments by @Barmar.
You need to use string='Show on diagram' (see https://www.crummy.com/software/BeautifulSoup/bs4/doc/#the-string-argument):
for link in soup.find_all('a', string='Show on diagram', href=True):
    print(link['href'])
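The reason the original call returns nothing is that a bare string passed as the second positional argument to find_all() is treated as a CSS class filter, not as the link text. A small self-contained check (the HTML snippet is made up for illustration):

from bs4 import BeautifulSoup

html = '<a class="btn" href="/diagram/1">Show on diagram</a>'
soup = BeautifulSoup(html, 'html.parser')

# second positional argument is matched against the class attribute -> no hits
print(soup.find_all('a', 'Show on diagram', href=True))

# string= matches the text inside the tag -> finds the link
print(soup.find_all('a', string='Show on diagram', href=True))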
I'm a complete beginner with Python. I wrote this program to scrape the closing odds of NHL games from The Score website and put that data in a file. The program runs, but 2 of the roughly 200 games I tried show up with incorrect data.
I think it is because of how I search through the divs within a div: I wrote that part so that it only stores the last div (which, conveniently, is the div I'm looking to scrape).
Also, I'm sure my way of writing to the file is poor for runtime; is there a better way to do this?
import requests
from bs4 import BeautifulSoup

# Function to scrape web and find the game title and closing odds
def get_match_data(url_val):
    # Set up html parser
    response = requests.get(url_val)
    html = response.text
    soup = BeautifulSoup(response.content, "html.parser")
    # Scrape for header which is "matchtitle"
    matchtitle = soup.find('h1', {'class': "sr-only"})
    # Code to find div and search for div within
    divs = soup.find('div', {'class': 'col-sm-4'})
    for tag in divs:
        # find div
        target = tag.find_all("div", {"class", "GameDetailsCard__row--3rKYp"})
        for tag in target:
            # find divs within target div
            odds = tag.find("div", {"class", "GameDetailsCard__content--2L_KF"})
    # Call write_to_file -> add data scraped from web
    write_to_file(matchtitle.text + " " + odds.text)

# Code to pass multiple urls to scrape for different games
def multi_games_url_handler(link):
    for x in range(26500, 26715):
        #print(x)
        url = link + str(x)
        #print(url)
        get_match_data(url)

def write_to_file(game_data):
    file = open("NHL_GAMES.txt", "a")
    file.write(game_data + "\n")
    file.close()

### Main(void) ?? idk what to call this portion of code not a python savant
# Fetch the webpage
link = "https://www.thescore.com/nhl/events/"
multi_games_url_handler(link)
Here is one line in the text file with correct data:
Toronto Maple Leafs @ New Jersey Devils on November 24, 2022 NJD -140, o/u 6.5
Here is one with incorrect data:
Carolina Hurricanes @ Anaheim Ducks on December 7, 2022 Justin St. Pierre, Chris Lee
Only 2 of the 215 lines were wrong like this.
It looks like certain NHL game pages (the Carolina one, for example) do not contain a <div> section for the odds, possibly because they were OT games. Regardless, your best bet is to add a clause to handle 'no odds found'. I have updated some of your code below:
import requests
from bs4 import BeautifulSoup

# Function to scrape web and find the game title and closing odds
def get_match_data(url_val):
    results = []
    # Set up html parser
    response = requests.get(url_val)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    # Scrape for header which is "matchtitle"
    matchtitle = soup.find('h1', {'class': "sr-only"})
    target = soup.find_all("div", {"class", "GameDetailsCard__row--3rKYp"})
    for tag in target:
        if "Odds" in str(tag.find("div", {"class": "GameDetailsCard__label--iBMhJ"})):
            odds = str(tag.find("div", {"class": "GameDetailsCard__content--2L_KF"}).text)
        else:
            odds = "No Odds found!"
        print(matchtitle.text + " " + odds)
        results.append(matchtitle.text + " " + odds)
    # Call write_to_file -> add data scraped from web
    write_to_file(results)

# Code to pass multiple urls to scrape for different games
def multi_games_url_handler(link):
    print("Getting game details...")
    for x in range(26500, 26715):
        #print(x)
        url = link + str(x)
        #print(url)
        get_match_data(url)

def write_to_file(game_data):
    with open("NHL_GAMES.txt", "a") as file:
        for line in game_data:
            file.write(line + "\n")

### Main(void) ?? idk what to call this portion of code not a python savant
# Fetch the webpage
link = "https://www.thescore.com/nhl/events/"
multi_games_url_handler(link)
I am trying to scrape product data given a keyword/search term, and so far I have managed to scrape all data from the first page to the last.
However, I want to change it so that I only scrape the first 100 or 150 products, and I'm not sure how to do that.
I reckon I need an integer that keeps track of how many items I have scraped and stops when it reaches 100 or 150.
I know I need to change something in the "for page in range(1, last_page)" loop, but everything I've tried ends up returning 100 copies of the same item, which isn't what I want.
def main(search_term):
    # RUN MAIN PROGRAM ROUTINE
    chromedriver = "path to chromedriver"
    driver = webdriver.Chrome(chromedriver)
    records = []
    url = get_url(search_term)
    driver.get(url)
    last_page = int(driver.find_element_by_xpath('(//div[@class="a-text-center"]/ul/li)[last()-1]').text) + 1
    # NUMBER OF PAGES TO CRAWL
    for page in range(1, last_page):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})
        print(page)
        for item in results:
            record = extract_record(item)
            if record:
                records.append(record)
    driver.close()

# Run the main function given a keyword
main("make-up")
# leads to https://www.amazon.com/s?k=cosmetics&ref=nb_sb_noss
# main("iphone")
How would I go about changing it so that I can scrape the first 100, 150, or however many products I want?
You have to check whether the number of records has reached 100 and then break out of the outer for loop as well:
for page in range(1, last_page):
    driver.get(url.format(page))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    results = soup.find_all('div', {'data-component-type': 's-search-result'})
    print(page)
    find = False
    for item in results:
        record = extract_record(item)
        if record:
            records.append(record)
        if len(records) == 100:
            find = True
            break
    if find:
        break
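If you prefer to avoid the flag variable, another option is to check the running total once per page, after the inner loop finishes, and trim any overshoot. A sketch under the same assumptions (records, extract_record, url, driver and last_page come from your main function; max_records is a name introduced here):

max_records = 100  # or 150, or whatever limit you want

for page in range(1, last_page):
    driver.get(url.format(page))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    results = soup.find_all('div', {'data-component-type': 's-search-result'})
    for item in results:
        record = extract_record(item)
        if record:
            records.append(record)
    if len(records) >= max_records:
        records = records[:max_records]  # drop any overshoot from the last page scraped
        break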
I am a beginner in web scraping, and I need help with this problem.
The website allrecipes.com lets you find recipes based on a search, which in this case is 'pie':
Link to the HTML file:
'view-source:https://www.allrecipes.com/search/results/?wt=pie&sort=re'
(right click -> view page source)
I want to create a program that takes an input, searches it on allrecipes, and returns a list with tuples of the first five recipes, with data such as the time it takes to make, serving yield, ingredients, and more.
This is my program so far:
import requests
from bs4 import BeautifulSoup

def searchdata():
    inp = input('what recipe would you like to search')
    url = 'http://www.allrecipes.com/search/results/?wt=' + str(inp) + '&sort=re'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    links = []
    # fill in code for finding top 3 or five links
    for i in range(3):
        a = requests.get(links[i])
        soupa = BeautifulSoup(a.text, 'html.parser')
        # fill in code to find name, ingredients, time, and serving size with data from soupa
        names = []
        time = []
        servings = []
        ratings = []
        ingredients = []

searchdata()
Yes, I know my code is very messy, but what should I fill in in the two code fill-in areas?
Thanks
After searching for the recipe, you have to get the link of each recipe and then make another request for each of those links, because the information you're looking for is not available on the search page. That would not look clean without OOP, so here's a class I wrote that does what you want.
import requests
from time import sleep
from bs4 import BeautifulSoup


class Scraper:
    links = []
    names = []

    def get_url(self, url):
        url = requests.get(url)
        self.soup = BeautifulSoup(url.content, 'html.parser')

    def print_info(self, name):
        self.get_url(f'https://www.allrecipes.com/search/results/?wt={name}&sort=re')
        if self.soup.find('span', class_='subtext').text.strip()[0] == '0':
            print(f'No recipes found for {name}')
            return
        results = self.soup.find('section', id='fixedGridSection')
        articles = results.find_all('article')
        texts = []
        for article in articles:
            txt = article.find('h3', class_='fixed-recipe-card__h3')
            if txt:
                if len(texts) < 5:
                    texts.append(txt)
                else:
                    break
        self.links = [txt.a['href'] for txt in texts]
        self.names = [txt.a.span.text for txt in texts]
        self.get_data()

    def get_data(self):
        for i, link in enumerate(self.links):
            self.get_url(link)
            print('-' * 4 + self.names[i] + '-' * 4)
            info_names = [div.text.strip() for div in self.soup.find_all(
                'div', class_='recipe-meta-item-header')]
            ingredient_spans = self.soup.find_all('span', class_='ingredients-item-name')
            ingredients = [span.text.strip() for span in ingredient_spans]
            for i, div in enumerate(self.soup.find_all('div', class_='recipe-meta-item-body')):
                print(info_names[i].capitalize(), div.text.strip())
            print()
            print('Ingredients'.center(len(ingredients[0]), ' '))
            print('\n'.join(ingredients))
            print()
            print('*' * 50, end='\n\n')


chrome = Scraper()
chrome.print_info(input('What recipe would you like to search: '))
import requests

focus_Search = raw_input("Focus Search ")
url = "https://www.google.com/search?q="
res = requests.get(url + focus_Search)
print("You Just Searched")
res_String = res.text
# Now I must get ALL the sections of code that start with "<a href" and end with "/a>"
I'm trying to scrape all the links from a Google search results page. I could extract each link one at a time, but I'm sure there's a better way to do it.
This creates a list of all the links on the search page, reusing some of your code, without getting into BeautifulSoup:
import requests
import lxml.html

focus_Search = input("Focus Search ")
url = "https://www.google.com/search?q="
res = requests.get(url + focus_Search).content
dom = lxml.html.fromstring(res)
# Borrows from cheekybastard in
# http://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
links = [x for x in dom.xpath('//a/@href')]
print(links)
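If you'd rather use BeautifulSoup instead of lxml, an equivalent sketch (note that Google may wrap result links in /url?q=... redirects, which this does not attempt to unwrap):

import requests
from bs4 import BeautifulSoup

focus_Search = input("Focus Search ")
res = requests.get("https://www.google.com/search?q=" + focus_Search)
soup = BeautifulSoup(res.text, "html.parser")
# collect the href attribute of every <a> tag on the page
links = [a["href"] for a in soup.find_all("a", href=True)]
print(links)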