Hi, I am trying to get the src data from an image on the website below. I locate the image by its class, since it is unique. With the code below I am able to locate the image, but I am unable to save it to MongoDB and it shows up as null, so I want to find the src and save that link instead.
P.S. The code works for other classes, but I am not sure how to locate the src and save it into "findImage".
https://myaeon2go.com/products/category/6236298/vegetable
The postal code is 56000.
cate_list = [
    "https://myaeon2go.com/products/category/1208101/fresh-foods",
    "https://myaeon2go.com/products/category/8630656/ready-to-eat",
    "https://myaeon2go.com/products/category/6528959/grocery",
    "https://myaeon2go.com/products/category/6758871/snacks",
    "https://myaeon2go.com/products/category/8124135/chill-&-frozen",
    "https://myaeon2go.com/products/category/4995043/beverage",
    "https://myaeon2go.com/products/category/3405538/household",
    "https://myaeon2go.com/products/category/493239/baby-&-kids",
]
cookies = {
    "hideLocationOverlay": "true",
    "selectedShippingState": "Kuala Lumpur",
    "selectedPostalCode": "56000",
}
for x in range(len(cate_list)):
    url = cate_list[x]
    # fetch the page and parse the HTML into a BeautifulSoup object
    result = requests.get(url, cookies=cookies)
    doc = BeautifulSoup(result.text, "html.parser")
    # a for loop located here to loop through all the products
    # <span class="n_MyDQk4X3P0XRRoTnOe a8H5VCTgYjZnRCen1YkC">myAEON2go Signature Taman Maluri</span>
    findImage = j.find("img", {"class": "pgJEkulRiYnxQNzO8njV shown"})
To extract the value of the src attribute, simply call .get('src') on your element.
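Applied to the variable in your question, that is (a minimal sketch; the image_src name is just for illustration, and it assumes findImage actually matched an img tag):

# take the src attribute instead of the whole Tag object; None if nothing matched
image_src = findImage.get("src") if findImage else None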
Also try to change your strategy for selecting elements and avoid classes that are often generated dynamically; I recommend relying on more static identifiers as well as the HTML structure.
for url in cate_list:
    result = requests.get(url, cookies=cookies, headers={'User-Agent': 'Mozilla/5.0'})
    doc = BeautifulSoup(result.text, "html.parser")
    for e in doc.select('.g-product-list li'):
        print(e.img.get('src'))
Note: Iterating your list does not need the range(len()) construct.
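For instance (a tiny illustration, not from the original answer):

# instead of indexing via range(len())
for x in range(len(cate_list)):
    url = cate_list[x]
# iterate directly over the items
for url in cate_list:
    ...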
Example
import requests
from bs4 import BeautifulSoup

cate_list = [
    "https://myaeon2go.com/products/category/1208101/fresh-foods",
    "https://myaeon2go.com/products/category/8630656/ready-to-eat",
    "https://myaeon2go.com/products/category/6528959/grocery",
    "https://myaeon2go.com/products/category/6758871/snacks",
    "https://myaeon2go.com/products/category/8124135/chill-&-frozen",
    "https://myaeon2go.com/products/category/4995043/beverage",
    "https://myaeon2go.com/products/category/3405538/household",
    "https://myaeon2go.com/products/category/493239/baby-&-kids",
]
cookies = {
    "hideLocationOverlay": "true",
    "selectedShippingState": "Kuala Lumpur",
    "selectedPostalCode": "56000",
}
for url in cate_list:
    result = requests.get(url, cookies=cookies, headers={'User-Agent': 'Mozilla/5.0'})
    doc = BeautifulSoup(result.text, "html.parser")
    for e in doc.select('.g-product-list li'):
        # the src wraps the asset URL; keep only the part after ')/'
        print(e.img.get('src').split(')/')[-1])
Output
https://assets.myboxed.com.my/1659400060229.jpg
https://assets.myboxed.com.my/1662502067580.jpg
https://assets.myboxed.com.my/1658448744726.jpg
https://assets.myboxed.com.my/1627880003755.jpg
https://assets.myboxed.com.my/1662507451284.jpg
https://assets.myboxed.com.my/1662501936757.jpg
https://assets.myboxed.com.my/1659400602324.jpg
https://assets.myboxed.com.my/1627880346297.jpg
https://assets.myboxed.com.my/1662501743853.jpg
...
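If the goal (as mentioned in the question) is to store these links in MongoDB rather than the whole img element, a minimal sketch with pymongo could look like this; it reuses cate_list and cookies from the example above, while the database, collection and field names are assumptions:

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")   # assumed local MongoDB instance
products = client["myaeon2go"]["products"]          # assumed database / collection names

for url in cate_list:
    result = requests.get(url, cookies=cookies, headers={'User-Agent': 'Mozilla/5.0'})
    doc = BeautifulSoup(result.text, "html.parser")
    for e in doc.select('.g-product-list li'):
        # save the link itself, not the Tag object, so it no longer shows up as null
        image_src = e.img.get('src') if e.img else None
        products.insert_one({"image": image_src})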
import requests
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed

cookies = {
    "hideLocationOverlay": "true",
    "selectedShippingState": "Kuala Lumpur",
    "selectedPostalCode": "56000",
}
links = [
    "8630656/ready-to-eat",
    "1208101/fresh-foods",
    "6528959/grocery",
    "6758871/snacks",
    "8124135/chill-&-frozen",
    "4995043/beverage",
    "3405538/household",
    "493239/baby-&-kids",
]
allin = []

def get_soup(content):
    # parse only the product <img> tags to keep the parse small and fast
    return BeautifulSoup(content, 'lxml', parse_only=SoupStrainer('img', class_="pgJEkulRiYnxQNzO8njV"))

def worker(req, url, link):
    r = req.get(url + link)
    soup = get_soup(r.content)
    return [urljoin(url, x['src']) for x in soup.select('img')]

def main(url):
    with requests.Session() as req, ThreadPoolExecutor(max_workers=10) as executor:
        req.cookies.update(cookies)
        # fetch all categories concurrently
        fs = (executor.submit(worker, req, url, link) for link in links)
        for f in as_completed(fs):
            allin.extend(f.result())
        print(allin)

if __name__ == "__main__":
    main('https://myaeon2go.com/products/category/')
Related
I am new to programming and I am trying to parse this page: https://ruz.spbstu.ru/faculty/100/groups
url = "https://ruz.spbstu.ru/faculty/100/groups"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
scripts = soup.find_all('script')
print(scripts[3].text)
This gives me:
window.__INITIAL_STATE__ = {"faculties":{"isFetching":false,"data":null,"errors":null},"groups":{"isFetching":false,"data":{"100":[{"id":35754,"name":"3733806/00301","level":3,"type":"common","kind":0,"spec":"38.03.06 Торговое дело","year":2022},{"id":35715,"name":"3763801/10103","level":2,"type":"common","kind":3,"spec":"38.06.01 Экономика","year":2022},{"id":34725,"name":"з3753801/80430_2021","level":5,"type":"distance","kind":2,"spec":"38.05.01 Экономическая безопасность","year":2022},{"id":33632,"name":"3733801/10002_2021","level":2,"type":"common","kind":0,"spec":"38.03.01 Экономика","year":2022}...........
The contents are very long, so this is only an extract from the output.
I need to get all the 'id's and 'name's from this output and put them into a dictionary like {id: name}, but I can't figure out a way to do it.
Any information will be very helpful.
Try:
import re
import json
import requests
from bs4 import BeautifulSoup
url = "https://ruz.spbstu.ru/faculty/100/groups"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
scripts = soup.find_all("script")
data = re.search(r".*?({.*});", scripts[3].text).group(1)
data = json.loads(data)
out = {d["id"]: d["name"] for d in data["groups"]["data"]["100"]}
print(out)
Prints:
{35754: '3733806/00301', 35715: '3763801/10103', ...etc.
I am attempting to crawl the NCBI eutils webpage. I want to extract the Id list from the response.
Here's the code for it:
import requests
from bs4 import BeautifulSoup

def get_html(url):
    """get the content of the url"""
    response = requests.get(url)
    response.encoding = 'utf-8'
    return response.text

def get_pmid(html):
    soup = BeautifulSoup(html, 'lxml')
    for texts in soup.select('body'):
        text = texts.get_text()
        print(text)

url_ncbi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=%22D-PANTOTHENIC+ACID%22&retmax=2000&usehistory=y&field=Title/Abstracts"
html = get_html(url_ncbi)
get_pmid(html)
I want to use the select function to acquire the text but cannot find the right selector for the script: for texts in soup.select(' ').
I'm confused by the multiple layers of classes and ids in the page source.
For getting all the Id tags you can use the find_all() function:
import requests
from bs4 import BeautifulSoup

def get_html(url):
    """get the content of the url"""
    response = requests.get(url)
    response.encoding = 'utf-8'
    return response.text

def get_pmid(html):
    soup = BeautifulSoup(html, 'lxml')
    rv = []
    for id_tag in soup.find_all('id'):
        rv.append(id_tag.text)
    return rv

url_ncbi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=%22D-PANTOTHENIC+ACID%22&retmax=2000&usehistory=y&field=Title/Abstracts"
html = get_html(url_ncbi)
all_ids = get_pmid(html)
print(all_ids)
Prints:
['29737393', '29209902', '24632028', '23727638', '22536244', '22052867', '15371742', '12204559', '10885798', '16348362', '3096335', '3734807', '6247641', '6997858', '761345', '108510', '355840', '1003285', '4676550', '5804470', '6076800', '6076775', '6012920', '14091285']
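Since the esearch endpoint actually returns XML, an alternative (a minimal sketch, not part of the original answer) is to skip BeautifulSoup and parse the response with the standard library's xml.etree.ElementTree:

import requests
import xml.etree.ElementTree as ET

url_ncbi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=%22D-PANTOTHENIC+ACID%22&retmax=2000&usehistory=y&field=Title/Abstracts"
response = requests.get(url_ncbi)
root = ET.fromstring(response.content)
# every PMID sits in an <Id> element under <IdList>
all_ids = [id_elem.text for id_elem in root.iter('Id')]
print(all_ids)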
In my code, a user inputs a search term and get_all_links parses the HTML response and extracts the links that start with 'http'. When req is replaced with a hard-coded URL such as:
content = urllib.request.urlopen("http://www.ox.ac.uk")
the program returns a list of properly formatted links correctly. However, when passing in req, no links are returned. I suspect this may be a formatting blip.
Here is my code:
import urllib.parse
import urllib.request

def get_all_links(s):  # function to get all the links
    d = 0
    links = []  # getting all links into a list
    while d != -1:  # until d is -1, i.e. no more links in the page
        d = s.find('<a href=', d)  # if <a href is found
        start = s.find('"', d)  # start will be the next character
        end = s.find('"', start + 1)  # end will be up to the closing "
        if d != -1:  # d is not -1
            d += 1
            if s[start + 1] == 'h':  # add the link which starts with http only
                links.append(s[start + 1:end])  # to link list
    return links  # return list

def main():
    term = input('Enter a search term: ')
    url = 'http://www.google.com/search'
    value = {'q': term}
    user_agent = 'Mozilla/5.0'
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(value)
    print(data)
    url = url + '?' + data
    print(url)
    req = urllib.request.Request(url, None, headers)
    content = urllib.request.urlopen(req)
    s = content.read()
    print(s)
    links = get_all_links(s.decode('utf-8'))
    for i in links:  # print the returned list
        print(i)

main()
You should use an HTML parser, as suggested in the comments. A library like BeautifulSoup is perfect for this.
I have adapted your code to use BeautifulSoup:
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

def get_all_links(s):
    soup = BeautifulSoup(s, "html.parser")
    return soup.select("a[href^=\"http\"]")  # select all anchor tags whose href attribute starts with 'http'

def main():
    term = input('Enter a search term: ')
    url = 'http://www.google.com/search'
    value = {'q': term}
    user_agent = 'Mozilla/5.0'
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(value)
    print(data)
    url = url + '?' + data
    print(url)
    req = urllib.request.Request(url, None, headers)
    content = urllib.request.urlopen(req)
    s = content.read()
    print(s)
    links = get_all_links(s.decode('utf-8'))
    for i in links:  # print the returned list
        print(i)

main()
It uses the select method of the BeautifulSoup library and returns a list of the selected elements (in your case anchor tags).
Using a library like BeautifulSoup not only makes this easier, it also lets you write much more complex selections. Imagine how you would have to change your code if you wanted to select all links whose href attribute contains the word "google" or "code".
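With BeautifulSoup that change is just a different CSS selector inside get_all_links (a small sketch, not from the original answer):

# anchors whose href contains "google" or "code" (CSS substring match *=)
return soup.select('a[href*="google"], a[href*="code"]')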
You can read the BeautifulSoup documentation at https://www.crummy.com/software/BeautifulSoup/bs4/doc/.
I'm scraping from two URLs that have the same DOM structure, and so I'm trying to find a way to scrape both of them at the same time.
The only caveat is that the data scraped from both these pages need to end up on distinctly named lists.
To explain with an example, here is what I've tried:
import os
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
        'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]

ws_list = []
ws48_list = []
categories = [ws_list, ws48_list]

for url in urls:
    response = requests.get(url, headers=headers)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    for a in section.find_all('a'):
        player_name = a.text
        for cat_list in categories:
            cat_list.append(player_name)

print(ws48_list)
print(ws_list)
This ends up printing two identical lists, when I was shooting for two lists each unique to its page.
How do I accomplish this? Would it be better practice to code it another way?
Instead of trying to append to already existing lists, just create new ones. Make a function to do the scrape and pass each URL to it in turn.
import os
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
        'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]

def parse_page(url, headers={}):
    response = requests.get(url, headers=headers)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    return [a.text for a in section.find_all('a')]

ws_list, ws48_list = [parse_page(url) for url in urls]

print('ws_list = %r' % ws_list)
print('ws48_list = %r' % ws48_list)
Just add each name to the appropriate list and the problem is solved:
for i, url in enumerate(urls):
    response = requests.get(url)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    for a in section.find_all('a'):
        player_name = a.text
        categories[i].append(player_name)

print(ws48_list)
print(ws_list)
You can use a function to define your scraping logic, then just call it for your urls.
import os
import requests
from bs4 import BeautifulSoup as bs

def scrape(url):
    response = requests.get(url)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    names = []
    for a in section.find_all('a'):
        player_name = a.text
        names.append(player_name)
    return names

ws_list = scrape('https://www.basketball-reference.com/leaders/ws_career.html')
ws48_list = scrape('https://www.basketball-reference.com/leaders/ws_per_48_career.html')

print(ws_list)
print(ws48_list)
Here is the URL that I am using:
http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i
In fact, on this page the link that I am looking for appears maybe 5 seconds after the page loads.
After those 5 seconds I see a POST request to:
http://www.protect-stream.com/secur.php
with data like so:
k=2AE_a,LHmb6kSC_c,sZNk4eNixIiPo_c,_c,Gw4ERVdriKuHJlciB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WxBkmAtz75kpYcrHzxtYt32hCYSp0WjqOQR9bY_a,ofQtw_b,
I can't figure out where the 'k' value comes from.
Does anyone have an idea on how we could get the 'k' value using Python?
This is not going to be trivial. The k parameter value is "hidden" deep inside a script element inside nested iframes. Here is a requests + BeautifulSoup way to get to the k value:
import re
from urlparse import urljoin
# Python 3: from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

base_url = "http://www.protect-stream.com"

with requests.Session() as session:
    response = session.get("http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i")

    # get the top frame url
    soup = BeautifulSoup(response.content, "html.parser")
    src = soup.select_one('iframe[src^="frame.php"]')["src"]
    frame_url = urljoin(base_url, src)

    # get the nested frame url
    response = session.get(frame_url)
    soup = BeautifulSoup(response.content, "html.parser")
    src = soup.select_one('iframe[src^="w.php"]')["src"]
    frame_url = urljoin(base_url, src)

    # get the frame HTML source and extract the "k" value
    response = session.get(frame_url)
    soup = BeautifulSoup(response.content, "html.parser")
    script = soup.find("script", text=lambda text: text and "k=" in text).get_text(strip=True)
    k_value = re.search(r'var k="(.*?)";', script).group(1)
    print(k_value)
Prints:
YjfH9430zztSYgf7ItQJ4grv2cvH3mT7xGwv32rTy2HiB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WXhmwUC0ipkPRkLQepYHLyF1U0xvsrzHMcK2XBCeY3_a,O_b,