from requests_html import HTMLSession

url = 'https://www.walmart.com/search?q=70+inch+tv&page=2&affinityOverride=default'
s = HTMLSession()
r = s.get(url)
r.html.render(sleep=1, timeout=20)
product = r.html.find('div.mb1.ph1.pa0-xl.bb.b--near-white.w-25')
productinfo = []
for item in product.absolute_links:
    # ra = s.get(item)
    # name = ra.html.find('h1', first=True).text
    products = {
        'link': item,
    }
    productinfo.append(products)
print(productinfo)
print(len(productinfo))
Output
for item in product.absolute_links:
AttributeError: 'list' object has no attribute 'absolute_links'
I want to get the link of every product and then scrape some data from this website with the requests-html library, but I'm getting an AttributeError. Please help me; check the website's HTML.
But can I solve the captcha and log in via the requests-html library? I'm not super familiar with the requests-html library.
Neither am I, but you can paste the request from your browser into https://curlconverter.com/ (they also have instructions on how to copy the request) and it will convert it to Python code for a request with the right headers and cookies, which you can then paste into your code. The last line of their code will be response = requests.get(..., but you can replace it with r = s.get(... so that your code can still use requests_html methods like .html.render and .absolute_links (plain requests doesn't parse the HTML).
Just keep in mind that the cookies will expire, likely within a few hours, and that you'll have to copy them from your browser again by then if you want to keep scraping this way.
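For example, the pasted result tends to look roughly like this (the header and cookie values below are placeholders; the real ones come from curlconverter's output for the request you copied):
from requests_html import HTMLSession

# Placeholder values: replace with the headers/cookies curlconverter generated
# from the request you copied out of your browser.
cookies = {'example_cookie': 'value-from-your-browser'}
headers = {'User-Agent': 'Mozilla/5.0 ...', 'Accept-Language': 'en-US,en;q=0.9'}

url = 'https://www.walmart.com/search?q=70+inch+tv&page=2&affinityOverride=default'
s = HTMLSession()
# curlconverter ends with response = requests.get(...); use the session instead
# so .html.render() and .absolute_links still work.
r = s.get(url, cookies=cookies, headers=headers)
r.html.render(sleep=1, timeout=20)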
for item in product.absolute_links:
AttributeError: 'list' object has no attribute 'absolute_links'
You can only apply .absolute_links to an element, and .find returns a list of elements (unless you specify first=True). Also, .absolute_links returns a set of links (even when that set contains only one link), so you need to either loop through it or convert it to a list and index into it to get at the link(s).
product = r.html.find('div.mb1.ph1.pa0-xl.bb.b--near-white.w-25')
productinfo = []
for prod in product:
    item = prod.absolute_links  # get product link/s
    # ra = s.get(list(item)[0])  # go to first product link
    # name = ra.html.find('h1', first=True).text
    products = {'link': item}
    productinfo.append(products)
or, to make absolutely sure that you're looping through unique URLs,
product = r.html.find('div.mb1.ph1.pa0-xl.bb.b--near-white.w-25')
prodUrls = set().union(*[d.absolute_links for d in product])  # combine all sets of product links
productinfo = []
for item in prodUrls:
    # ra = s.get(item)
    # name = ra.html.find('h1', first=True).text
    products = {'link': item}
    productinfo.append(products)
Btw, if it doesn't find any products, then of course you won't get any links even if the error goes away, so add a line that prints the request status (in case something went wrong there) as well as how many products and links were extracted.
print(r.status_code, r.reason, f' - {len(product)} products and {len(prodUrls)} product links from', r.url)
I've been trying to fetch all US zip codes for a web scraping project for my company.
I'm trying to use the uszipcode library to do it automatically rather than scraping them manually from the website I'm interested in, but I can't figure it out.
This is my manual attempt:
from bs4 import BeautifulSoup
import requests

url = 'https://www.unitedstateszipcodes.org'
headers = {'User-Agent': 'Chrome/50.0.2661.102'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')

hrefs = []
all_zipcodes = []

# Extract all
for data in soup.find_all('div', class_='state-list'):
    for a in data.find_all('a'):
        if a is not None:
            hrefs.append(a.get('href'))
hrefs.remove(None)
def get_zipcode_list():
    """
    Visit each state's page and collect every 5-digit zip code listed there.
    :return: list of zip code strings.
    """
    for state in hrefs:
        state_url = url + state
        state_page = requests.get(state_url, headers=headers)
        states_soup = BeautifulSoup(state_page.text, 'html.parser')
        div = states_soup.find(class_='list-group')
        for a in div.findAll('a'):
            if str(a.string).isdigit():
                all_zipcodes.append(a.string)
    return all_zipcodes
This takes a lot of time, and I would like to know how to do the same thing more efficiently using uszipcode.
You may try searching by the empty pattern '':
from uszipcode import SearchEngine

s = SearchEngine()
l = s.by_pattern('', returns=1000000)
print(len(l))
More details are in the docs and in their basic tutorial.
import pandas as pd
from uszipcode import SearchEngine

engine = SearchEngine()
allzips = {}
for i in range(100000):  # get zipcode info for every possible 5-digit combination
    zipcode = str(i).zfill(5)
    try:
        allzips[zipcode] = engine.by_zipcode(zipcode).to_dict()
    except:
        pass  # skip combinations that aren't real zip codes

# Convert dictionary to DataFrame
allzips = pd.DataFrame(allzips).T.reset_index(drop=True)
Since zip codes are only 5 digits, you can iterate up to 100k and keep the zip codes that don't raise an error. This solution gives you a DataFrame with all the stored information for each saved zip code.
The regex for a US zip code is [0-9]{5}(?:-[0-9]{4})?, so you can simply check it with the re module:
import re

regex = r"[0-9]{5}(?:-[0-9]{4})?"
zipcode = "12345-6789"  # example value
if re.match(regex, zipcode):
    print("match")
else:
    print("not a match")
You can download the list of zip codes from the official source and then parse it, if it's for one-time use and you don't need any of the other metadata associated with each zip code that uszipcode provides.
uszipcode also has a second, much larger database that should have all the data you need:
from uszipcode import SearchEngine
zipSearch = SearchEngine(simple_zipcode=False)
allZipCodes = zipSearch.by_pattern('', returns=200000)
print(len(allZipCodes))
I would like to extract information for multiple drugs from multiple pages, e.g.
https://www.medindia.net/doctors/drug_information/abacavir.htm,
https://www.medindia.net/doctors/drug_information/talimogene_laherparepvec.htm,
and so on.
On each page, the information that I would like to extract is: General, Brands, Prescription Contraindications, Side effects, Dosage, How to Take, Warning and Storage.
Using Beautiful Soup, I am able to identify the class needed for extraction. However, when I try to extract the information and store it in a variable, I get 'NoneType' object has no attribute 'get_text'. It seems that there is no element with the class 'drug-content', yet when I print the items the class shows up. Please help me. Below is my code:
import pandas as pd
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
url = 'https://www.medindia.net/doctors/drug_information/abacavir.htm'
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
drug = soup.find(class_='mi-container__fluid')
print(drug)
# whole page contain drug content
items = drug.find_all(class_='drug-content')
print(items)
# extract drug information from drug content into individual variable
general = items[0].find(class_='drug-content').get_text(strip=True).replace("\n", "")
brand = items[1].find(class_='report-content').get_text(strip=True).replace("\n", "")
prescription = items[1].find(class_='drug-content').get_text(strip=True).replace("\n", "")
contraindications = items[2].find(class_='drug-content').get_text(strip=True).replace("\n", "")
side_effect = items[2].find(class_='drug-content').get_text(strip=True).replace("\n", "")
dosage = items[3].find(class_='drug-content').get_text(strip=True).replace("\n", "")
how_to_use = items[4].find(class_='drug-content').get_text(strip=True).replace("\n", "")
warnings = items[5].find(class_='drug-content').get_text(strip=True).replace("\n", "")
storage = items[7].find(class_='drug-content').get_text(strip=True).replace("\n", "")
I have tried changing the class to 'report-content drug-widget'. However, with that class I am unable to extract the general information, and side effects are not available for this drug. How can I put an NA into the variable if the information is not available for the drug?
# whole page contain drug content
items = drug.find_all(class_='report-content drug-widget')
print(items)
# extract drug information from drug content into individual variable
general = items.find(class_='drug-content').get_text(strip=True).replace("\n", "")
brand = items[0].find(class_='drug-content').get_text(strip=True).replace("\n", "")
Please advise how to extract the information, and how I can put NA where the information I need is not available.
I can help you with the first one; it should get you started on how to deal with missing elements and how to search for the pattern you're looking for:
try:
    general = items[0].find('h3', attrs={'style': 'margin:0px!important'}).get_text(strip=True).replace("\n", "").replace("\xa0", " ")
except:
    general = "N/A"
You can slice the 'Generic Name:' prefix off, since it's probably the same length for each answer:
general = general[15:]
print(general)
# 'Abacavir'
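If you want the same N/A fallback for every section, one way to generalize it is a small helper like the sketch below (the class names and indices are taken from your snippets and are assumptions that may need adjusting per drug page):
def safe_text(item, class_name, default="N/A"):
    # Return the stripped text of the first tag with that class,
    # or a default when the tag (or the parent item) is missing.
    tag = item.find(class_=class_name) if item is not None else None
    if tag is None:
        return default
    return tag.get_text(strip=True).replace("\n", "").replace("\xa0", " ")

items = drug.find_all(class_='report-content drug-widget')
brand = safe_text(items[0], 'drug-content') if len(items) > 0 else "N/A"
side_effect = safe_text(items[2], 'drug-content') if len(items) > 2 else "N/A"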
I am trying to create my first Python web-scraper to automate one task for work - I need to write all vacancies from this website (only for health) to an Excel file. Using a tutorial, I have come up with the following program.
However, in step 6, I receive an error stating: IndexError: list index out of range.
I have tried using start_page = paging[2].text, as I thought that the first page may be the base page, but it results in the same error.
Here are the steps that I followed:
I checked that the website https://iworkfor.nsw.gov.au allows scraping
Imported the necessary libraries:
import requests
from bs4 import BeautifulSoup
import pandas
stored the URL as a variable:
base_url = "https://iworkfor.nsw.gov.au/nsw-health-jobs?divisionid=1"
Get the HTML content:
r = requests.get(base_url)
c = r.content
Parse the HTML:
soup = BeautifulSoup(c,"html.parser")
To extract the first and last page numbers
paging = soup.find("div",{"class":"pana jobResultPaging tab-paging-top"}).find_all("a")
start_page = paging[1].text
last_page = paging[len(paging)-2].text
Making an empty list to append all the content:
web_content_list = []
Making page links from the page numbers, crawling through the pages and extracting the contents from the corresponding tags:
for page_number in range(int(start_page), int(last_page) + 1):
    # To form the url based on page numbers
    url = base_url + "&page=" + str(page_number)
    r = requests.get(base_url + "&page=" + str(page_number))
    c = r.content
    soup = BeautifulSoup(c, "html.parser")
To extract the Title
vacancies_header = soup.find_all("div", {"class":"box-sec2-left"})
To extract the LHD, Job type and Job Reference number
vacancies_content = soup.find_all("div", {"class":"box-sec2-right"})
To process the vacancies one by one by looping:
for item_header, item_content in zip(vacancies_header, vacancies_content):
    # To store the information in a dictionary
    web_content_dict = {}
    web_content_dict["Title"] = item_header.find("a").text.replace("\r", "").replace("\n", "")
    web_content_dict["Date Posted"] = item_header.find("span").text
    web_content_dict["LHD"] = item_content.find("h5").text
    web_content_dict["Position Type"] = item_content.find("p").text
    web_content_dict["Job Reference Number"] = item_content.find("span", {"class": "box-sec2-reference"}).text
    # To store the dictionary in a list
    web_content_list.append(web_content_dict)
To make a dataframe with the list
df = pandas.DataFrame(web_content_list)
To write the dataframe to a csv file
df.to_csv("Output.csv")
Ideally, the program will write the data about all vacancies to a CSV file as a nice table with the columns: title, date posted, LHD, position type, and job reference number.
The problem is that your initial call to find() returns an empty <div>, so your subsequent call to find_all() returns an empty list:
>div = soup.find("div",{"class":"pana jobResultPaging tab-paging-top"})
>div
<div class="pana jobResultPaging tab-paging-top">
</div>
>div.find_all("a")
[]
Update:
The reason you're unable to parse the contents of the <div> in question (i.e. why it's empty) is that the data retrieved from the server is "paginated" by client-side JavaScript (code running in your browser). Your Python code parses only the HTML that is returned by the request to iworkfor.nsw.gov.au; the data you're after (the part that gets turned into "pages") is requested by that same JavaScript and returned by the server in a format called JSON.
So the bad news is that the instructions you were given will not work as-is. You will have to fetch the JSON returned by the server, parse it, and then decode the escaped HTML that it contains.
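As a rough illustration only: the real endpoint and parameters have to be copied from the browser's Network tab while paging through the results, and the URL below is a placeholder, not the site's actual API.
import requests

# Placeholder endpoint: open DevTools -> Network -> XHR/Fetch, click through the
# result pages, and copy the request URL the page's JavaScript actually calls.
json_url = "https://iworkfor.nsw.gov.au/REPLACE_WITH_REAL_SEARCH_ENDPOINT"

r = requests.get(json_url, params={"divisionid": 1, "page": 1})
r.raise_for_status()
data = r.json()  # the server answers this request with JSON, not a full HTML page
print(type(data))  # inspect the structure before deciding what to extract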
I am trying to get the names of the members of a group I am a member of. I am able to get the names on the first page but I'm not sure how to go to the next page:
My Code:
url = 'https://graph.facebook.com/v2.5/1671554786408615/members?access_token=<MY_CUSTOM_ACCESS_CODE_HERE>'
json_obj = urllib2.urlopen(url)
data = json.load(json_obj)
for each in data['data']:
    print each['name']
Using the code above I successfully get all the names on the first page, but the question is: how do I go to the next page?
In the Graph API Explorer output screen I can see that the response includes a paging object with a next URL.
What change does my code need so that it keeps going to the next pages and gets the names of ALL members of the group?
The JSON returned by the Graph API is telling you where to get the next page of data, in data['paging']['next']. You could give something like this a try:
def printNames():
    json_obj = urllib2.urlopen(url)
    data = json.load(json_obj)
    for each in data['data']:
        print each['name']
    return data['paging']['next']  # Return the URL to the next page of data

url = 'https://graph.facebook.com/v2.5/1671554786408615/members?access_token=<MY_CUSTOM_ACCESS_CODE_HERE>'
url = printNames()
print "====END OF PAGE 1===="
url = printNames()
print "====END OF PAGE 2===="
You would need to add checks (for instance, ['paging']['next'] will only be present in the JSON object if there is a next page), so you might want to modify your function to return a richer structure to convey this, but this should give you the idea.
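For example, a sketch of that idea in the same Python 2 style as your code, looping until 'paging'/'next' disappears (untested; the group id and token placeholder are the ones from your question):
import json
import urllib2

def print_all_names(start_url):
    # Follow data['paging']['next'] until the API stops returning one.
    next_url = start_url
    while next_url:
        data = json.load(urllib2.urlopen(next_url))
        for each in data.get('data', []):
            print each['name']
        # 'next' is only present while there are more pages
        next_url = data.get('paging', {}).get('next')

print_all_names('https://graph.facebook.com/v2.5/1671554786408615/members?access_token=<MY_CUSTOM_ACCESS_CODE_HERE>')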
I am attempting to create a bot that fetches market links from Steam, but have run into a problem. I was able to return all the data from a single page, but when I attempt to get multiple pages it just gives me copies of the first page, even though I give it working links (e.g. http://steamcommunity.com/market/search?q=appid%3A753#p1 and then http://steamcommunity.com/market/search?q=appid%3A753#p2). I have tested the links and they work in my browser. This is my code:
import urllib2
import random
import time

start_url = "http://steamcommunity.com/market/search?q=appid%3A753"
end_page = 3
urls = []

def get_raw(url):
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    return response.read()

def get_market_urls(html):
    index = 0
    while index != -1:
        index = html.find("market_listing_row_link", index + 25)
        beg = html.find("http", index)
        end = html.find('"', beg)
        print html[beg:end]
        urls.append(html[beg:end])

def go_to_page(page):
    return start_url + "#p" + str(page)

def wait(min, max):
    wait_t = random.randint(min, max)
    time.sleep(wait_t)

for i in range(end_page):
    url = go_to_page(i + 1)
    raw = get_raw(url)
    get_market_urls(raw)
Your problem is that you've misunderstood what the URL says.
The number after the hash sign doesn't make it a different URL that can be fetched; that part is the URL fragment, which is never sent to the server. On that particular page the fragment tells the JavaScript which page of results to pull via AJAX. (Read about it Here and Here if you're interested.)
Anyway, you should look at this URL instead: http://steamcommunity.com/market/search/render/?query=appid%3A753&start=00&count=10. You can play with the start=00&count=10 parameters to get the results you want.
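For instance, here is a sketch that pages through that render endpoint and reuses the get_market_urls parser from your code; it assumes the JSON response carries the listings as an HTML fragment under a results_html key, which you should verify by opening the URL in your browser.
import json
import time
import urllib2

render_url = "http://steamcommunity.com/market/search/render/?query=appid%3A753&start={}&count={}"
count = 10

for start in range(0, count * 3, count):  # first three pages of results
    raw = urllib2.urlopen(render_url.format(start, count)).read()
    data = json.loads(raw)
    # Key name assumed from inspecting the response; check it in your browser.
    fragment = data.get("results_html", "")
    get_market_urls(fragment)  # parser defined in the question's code
    time.sleep(1)  # be polite between requests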
Enjoy.