I've been working on a process to scrape the Game Result column from all of the links listed (each one is unique with random ID #'s).
This script does not finish running, although it works fine with fewer links. I can't seem to get the result for the full list - my backup plan would be to just run it in batches, but that is not really ideal.
I was thinking I could potentially pull out the ids in a list, and then loop through them (so it's only one URL), but I wasn't sure if this would make a difference? I also wasn't sure how to execute and didn't want to spend time figuring it out if not worth it.
from bs4 import BeautifulSoup
import requests
# Absolute game URLs collected from every player page.
profiles = []
# Player "game by game" pages to scrape.
# NOTE(review): only the first URL of the list is visible in this snippet.
urls = [
    'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=2&stats_player_seq=-100',
]
for url in urls:
    # A browser-like User-Agent header is sent with each request.
    req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(req.text, 'html.parser')
    # Matching hrefs are site-relative ("/game/..."), so prefix the host.
    for profile in soup.select('table a[href^="/game/"]'):
        profile = 'https://stats.ncaa.org'+profile.get('href')
        # Keep the result; previously the URL was built and then discarded.
        profiles.append(profile)
I used the CSS selector .smtext > a[href^="/game/"] and it works. As #OneCricketeer has stated, the pages take time to load, so I also added a short delay between requests.
from bs4 import BeautifulSoup
import requests
import time
# Absolute game URLs collected from every player page.
profiles = []
# Player "game by game" pages to scrape.
# NOTE(review): only the first URL of the list is visible in this snippet.
urls = [
    'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=2&stats_player_seq=-100',
]
for url in urls:
    req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(req.text, 'html.parser')
    # Matching hrefs are site-relative ("/game/..."), so prefix the host.
    for profile in soup.select('.smtext > a[href^="/game/"]'):
        profile = 'https://stats.ncaa.org'+profile.get('href')
        profiles.append(profile)
    # Pause between pages so the site is not hit with back-to-back requests.
    # NOTE(review): the delay length is not shown in the snippet; 1 second
    # is a placeholder to tune.
    time.sleep(1)
print(profiles)
['https://stats.ncaa.org/game/index/5155492?org_id=2', 'https://stats.ncaa.org/game/index/5157510?org_id=2', 'https://stats.ncaa.org/game/index/5161595?org_id=2', 'https://stats.ncaa.org/game/index/5168727?org_id=2', 'https://stats.ncaa.org/game/index/5169809?org_id=2', 'https://stats.ncaa.org/game/index/5170947?org_id=2', 'https://stats.ncaa.org/game/index/5172890?org_id=2', 'https://stats.ncaa.org/game/index/5176186?org_id=2', 'https://stats.ncaa.org/game/index/5177526?org_id=2', 'https://stats.ncaa.org/game/index/5178745?org_id=2', 'https://stats.ncaa.org/game/index/5179659?org_id=2', 'https://stats.ncaa.org/game/index/5180379?org_id=2', 'https://stats.ncaa.org/game/index/5180862?org_id=2', 'https://stats.ncaa.org/game/index/5182230?org_id=2', 'https://stats.ncaa.org/game/index/5183097?org_id=2', 'https://stats.ncaa.org/game/index/5184752?org_id=2', 'https://stats.ncaa.org/game/index/5185968?org_id=2', 'https://stats.ncaa.org/game/index/5187936?org_id=2', 'https://stats.ncaa.org/game/index/5189299?org_id=2', 'https://stats.ncaa.org/game/index/5191229?org_id=2', 'https://stats.ncaa.org/game/index/5193013?org_id=2', 'https://stats.ncaa.org/game/index/5194803?org_id=2', 'https://stats.ncaa.org/game/index/5197087?org_id=2', 'https://stats.ncaa.org/game/index/5203612?org_id=2', 'https://stats.ncaa.org/game/index/5207012?org_id=2', 'https://stats.ncaa.org/game/index/5214481?org_id=2', 'https://stats.ncaa.org/game/index/5216857?org_id=2', 'https://stats.ncaa.org/game/index/5222370?org_id=2', 'https://stats.ncaa.org/game/index/5225017?org_id=2', 'https://stats.ncaa.org/game/index/5229763?org_id=2', 'https://stats.ncaa.org/game/index/5230552?org_id=2', '
I'm trying to extract the information from a "script" tag, the code is as follows
# Fetch one product page and pull out its display fields and per-size SKUs.
# NOTE(review): `headers` and `api` are used below but are not defined in the
# visible snippet — confirm they are set before this point.
response = requests.get("https://www.zalando.es/jordan-air-jordan-mid-zapatillas-altas-blackdark-beetrootwhitehyper-royal-joc11a024-g11.html?hl=1610800800024", headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
marca = soup.find("h3", {"class":"OEhtt9 ka2E9k uMhVZi uc9Eq5 pVrzNP _5Yd-hZ"}).text
nombre = soup.find("h1", {"class":"OEhtt9 ka2E9k uMhVZi z-oVg8 pVrzNP w5w9i_ _1PY7tW _9YcI4f"}).text
color = soup.find("span", {"class":"u-6V88 ka2E9k uMhVZi dgII7d z-oVg8 pVrzNP"}).text
precio = soup.find("span", {"class":"uqkIZw ka2E9k uMhVZi FxZV-M z-oVg8 pVrzNP"}).text
talla = soup.find("span", {"class":"u-6V88 ka2E9k uMhVZi FxZV-M z-oVg8 pVrzNP"}).text
imagen = soup.find("img", {"class": "_6uf91T z-oVg8 u-6V88 ka2E9k uMhVZi FxZV-M _2Pvyxl JT3_zV EKabf7 mo6ZnF _1RurXL mo6ZnF PZ5eVw"})['src']

# Serialize and split the 16th <script> tag once instead of once per size.
# The script index and the negative slice lengths below are hard-coded for
# this particular product page.
sku_chunks = str(soup.find_all('script')[15]).split('sku":"')
sku355 = api + sku_chunks[3][:-137]
sku36 = api + sku_chunks[4][:-139]
sku365 = api + sku_chunks[5][:-139]
sku375 = api + sku_chunks[6][:-137]
sku38 = api + sku_chunks[7][:-139]
sku385 = api + sku_chunks[8][:-137]
sku39 = api + sku_chunks[9][:-137]
sku40 = api + sku_chunks[10][:-139]
sku405 = api + sku_chunks[11][:-137]
sku41 = api + sku_chunks[12][:-137]
sku42 = api + sku_chunks[13][:-139]
sku425 = api + sku_chunks[14][:-137]
sku43 = api + sku_chunks[15][:-125]

# The first name printed used to be the undefined `sku3555`.
for sku in (sku355, sku36, sku365, sku375, sku38, sku385, sku39,
            sku40, sku405, sku41, sku42, sku425, sku43):
    print(sku)
Everything works perfectly with these shoes, but when I switch, for example, to this link it gives me something else. What I would like to extract is the SKU of each size, regardless of which link is used.
Could not reproduce your example, would be cool to improve your question.
Just in case
If you just wanna grab the sizes, try the following:
import requests, json
from bs4 import BeautifulSoup
# Print the SKU and local size of every unit listed in the page's JSON props.
headers = {"user-agent": "Mozilla/5.0"}
response = requests.get("https://www.zalando.es/jordan-air-jordan-mid-zapatillas-altas-blackdark-beetrootwhitehyper-royal-joc11a024-g11.html?hl=1610800800024", headers=headers)
soup = BeautifulSoup(response.content, 'lxml')

# Take the text after the 'CDATA' marker and before the first ']>' in the
# props script tag, then parse it as JSON.
props_text = soup.select_one('script#z-vegas-pdp-props').contents[0]
json_text = props_text.split('CDATA')[1].split(']>')[0]
json_object = json.loads(json_text)

for item in json_object[0]['model']['articleInfo']['units']:
    print(f"sku:{item['id']} - size:{item['size']['local']}")
sku:JOC11A024-G110005000 - size:35.5
sku:JOC11A024-G110055000 - size:36
sku:JOC11A024-G110006000 - size:36.5
sku:JOC11A024-G110065000 - size:37.5
sku:JOC11A024-G110007000 - size:38
sku:JOC11A024-G110075000 - size:38.5
sku:JOC11A024-G110008000 - size:39
sku:JOC11A024-G110085000 - size:40
sku:JOC11A024-G110009000 - size:40.5
sku:JOC11A024-G110095000 - size:41
sku:JOC11A024-G110010000 - size:42
sku:JOC11A024-G110105000 - size:42.5
sku:JOC11A024-G110011000 - size:43
I want to scrape weather data from a certain website. The default page layout gives a maximum of 40 results, whereas the simple-list layout gives 100. The layout keeps resetting to the default, which makes this difficult to achieve with Selenium. Is there any way to use the cookies saved in Chrome with Beautiful Soup?
import requests
from bs4 import BeautifulSoup
import browser_cookie3
# Reuse the browser's cookies for the target site in a requests session, then
# list every entry on the page.
cj = browser_cookie3.load()
s = requests.Session()
url = "https://something.org/titles/2"  # was "https:/..." (single slash)
for c in cj:
    if 'mangadex' in str(c):
        # Attach the cookie to the session; without this the request below
        # goes out without any browser cookies.
        # NOTE(review): this step is not in the snippet as posted.
        s.cookies.set_cookie(c)
r = s.get(url)
soup = BeautifulSoup(r.content, 'lxml')
entries = soup.find_all('div', {'class': 'manga-entry col-lg-6 border-bottom pl-0 my-1'})
# `i` is a 1-based running count; it was printed before without being defined.
for i, anime in enumerate(entries, start=1):
    det = anime.find('a', {"class": "ml-1 manga_title text-truncate"})
    anime_name = det.text
    anime_link = det['href']
    stars = anime.select("span")[3].text
    print(anime_name, anime_link, stars,i)
import browser_cookie3
import requests
# Copy the browser cookies belonging to one site into a requests session.
cj = browser_cookie3.load()
s = requests.Session()
for c in cj:
    if 'sitename' in str(c):
        # Attach the cookie to the session so later requests send it.
        # NOTE(review): this step is not in the snippet as posted.
        s.cookies.set_cookie(c)
# NOTE(review): `the_site` is a placeholder for the URL to fetch; it is not
# defined in this snippet.
r = s.get(the_site)
This code uses the browser's cookies with the requests module through a Session. Simply change sitename to the site you want cookies from.
Your new code:
import requests
from bs4 import BeautifulSoup
import browser_cookie3
# Reuse the browser's cookies for the target site in a requests session, then
# print each entry with a running number.
cj = browser_cookie3.load()
s = requests.Session()
url = "https://something.org/titles/2"
for c in cj:
    if 'mangadex' in str(c):
        # Attach the cookie to the session; without this the request below
        # goes out without any browser cookies.
        # NOTE(review): this step is not in the snippet as posted.
        s.cookies.set_cookie(c)
r = s.get(url)
soup = BeautifulSoup(r.content, 'lxml')
entries = soup.find_all('div', {'class': 'manga-entry row m-0 border-bottom'})
# enumerate replaces the manual `i = 1` / `i = i + 1` counter.
for i, anime in enumerate(entries, start=1):
    det = anime.find('a', {"class": "ml-1 manga_title text-truncate"})
    anime_name = det.text
    anime_link = det['href']
    stars = anime.select("span")[3].text
    print(anime_name, anime_link, stars, i)
-Hitogatana- /title/540/hitogatana 4 1
-PIQUANT- /title/44134/piquant 5 2
-Rain- /title/37103/rain 4 3
-SINS- /title/1098/sins 4
:radical /title/46819/radical 1 5
:REverSAL /title/3877/reversal 3 6
... /title/52206/ 7
...Curtain. ~Sensei to Kiyoraka ni Dousei~ /title/7829/curtain-sensei-to-kiyoraka-ni-dousei 8
...Junai no Seinen /title/28947/junai-no-seinen 9
...no Onna /title/10162/no-onna 2 10
...Seishunchuu! /title/19186/seishunchuu 11
...Virgin Love /title/28945/virgin-love 12
.flow - Untitled (Doujinshi) /title/27292/flow-untitled-doujinshi 2 13
.gohan /title/50410/gohan 14
.hack//4koma + Gag Senshuken /title/7750/hack-4koma-gag-senshuken 24 15
.hack//Alcor - Hagun no Jokyoku /title/24375/hack-alcor-hagun-no-jokyoku 16
.hack//G.U.+ /title/7757/hack-g-u 1 17
.hack//GnU /title/7758/hack-gnu 18
.hack//Link - Tasogare no Kishidan /title/24374/hack-link-tasogare-no-kishidan 1 19
.hack//Tasogare no Udewa Densetsu /title/5817/hack-tasogare-no-udewa-densetsu 20
.hack//XXXX /title/7759/hack-xxxx 21
.traeH /title/9789/traeh 22
(G) Edition /title/886/g-edition 1 23
(Not) a Househusband /title/22832/not-a-househusband 6 24
(R)estauraNTR /title/37551/r-estaurantr 14 25
[ rain ] 1st Story /title/25587/rain-1st-story 3 26
[another] Xak /title/24881/another-xak 27
[es] ~Eternal Sisters~ /title/4879/es-eternal-sisters 1 28
and so on to 100...
The program is supposed to return the title, Metascore, genre and gross for all 50 movies, and return None where a value is not available, so that every list has 50 elements; currently it gives only 43 elements.
# Fetch one results page of the search and parse it.
url = requests.get(f'https://www.imdb.com/search/title/?title_type=feature&year=2017-01-01,2017-12-31&start=51&ref_=adv_nxt')
soup = BeautifulSoup(url.text, 'html.parser')
# Six page-wide selector lists are walked in lockstep. zip() stops at the
# shortest list, and each list only contains the items that have that field,
# so the loop can yield fewer rows than there are movies and pair values from
# different movies.
# NOTE(review): the loop body is not shown in this snippet.
for t, m, g, r, c, i in zip(soup.select('div.lister-list >div.lister-item>div.lister-item-content>h3.lister-item-header>a'),
soup.select('div.lister-list >div.lister-item>div.lister-item-content>div.ratings-bar>div.ratings-metascore>span'),
soup.select('div.lister-list >div.lister-item>div.lister-item-content>p.text-muted>.genre'),
soup.select('div.lister-list >div.lister-item>div.lister-item-content>p.text-muted>.runtime'),
soup.select('div.lister-list >div.lister-item>div.lister-item-content>p.text-muted>.certificate'),
soup.select('div.lister-list >div.lister-item>div.lister-item-content>div.ratings-bar>div>strong')):
For loops return None value to values not present
# Walk each result's "sort-num_votes-visible" paragraph and read its
# <span name="nv"> elements.
# NOTE(review): `votes` is not defined in this snippet and the `except` below
# has no visible matching `try` — surrounding lines appear to be missing.
for v in soup.select('div.lister-item-content >p.sort-num_votes-visible'):
votes.append(v.find('span', attrs = {'name':'nv'}).text)
vote = v.find_all('span', attrs={'name': 'nv'})
except IndexError:
Some movies don't have metascore and some of them don't have certificate either. You either go for try-except blocks or conditional statements to get rid of that error. I used the latter within the following example. Give it a shot:
import requests
from bs4 import BeautifulSoup
link = 'https://www.imdb.com/search/title/?title_type=feature&year=2017-01-01,2017-12-31&start=51&ref_=adv_nxt'
res = requests.get(link)
soup = BeautifulSoup(res.text, 'html.parser')


def _text_or_none(node, selector):
    """Return the stripped text of the first match for *selector*, or None."""
    match = node.select_one(selector)
    return match.get_text(strip=True) if match is not None else None


# Read every field relative to its own list item, so a missing field becomes
# None for that movie instead of shifting the other values.
for item in soup.select(".lister-item"):
    name = _text_or_none(item, 'h3.lister-item-header > a')
    score = _text_or_none(item, 'span.metascore')
    genre = _text_or_none(item, 'span.genre')
    runtime = _text_or_none(item, 'span.runtime')
    certificate = _text_or_none(item, 'span.certificate')
    rating = _text_or_none(item, '.rating-star + strong')
    # NOTE(review): the snippet shows no output statement; the row is printed
    # here so the extracted values are visible.
    print(name, score, genre, runtime, certificate, rating)
I have written the following function that scrapes multiple pages from a website. I only want to get the first 20 or so pages. How can I limit the number of rows that I fill in my dataframe:
# Follow matching links from `experiences` to sub-pages, then to individual
# report pages, and read title / genre / author <div> elements from each.
# NOTE(review): this excerpt is incomplete — `experiences`, `base_url`,
# `subpage`, `reporturlopen`, `author` and `empty_list` are not defined in the
# visible lines, the three `for i in ...` loops have no visible bodies, and
# the original indentation was lost.
def scrape_page(poi,page_name):
# Links whose href matches the pattern page_name + ".shtml$".
for link in experiences.findAll('a', attrs={'href': re.compile(page_name+".shtml$")}):
url=urljoin(base_url, link.get("href"))
expages=BeautifulSoup(subpage, "html.parser")
# Report links: hrefs starting with "/experiences/exp".
for report in expages.findAll('a', attrs={'href': re.compile("^/experiences/exp")}):
url=urljoin(base_url, report.get("href"))
reporturl=BeautifulSoup(reporturlopen, "html.parser")
book_title= reporturl.findAll("div",attrs={'class':'title'})
for i in book_title:
book_genre= reporturl.findAll("div",attrs={'class':'genre'})
for i in book_genre:
book_author= reporturl.findAll("div",attrs={'class':'author'})
for i in book_author:
# Removes every occurrence of "by" from the author string.
author = re.sub("by", "",author)
# Publishes `empty_list` as a module-level attribute named "<poi>_<page_name>_df".
setattr(sys.modules[__name__], '{}_df'.format(poi+"_"+page_name), empty_list)
You can for example add a while loop:
i = 0
while i < 20:
< insert your code >
i += 1
I want to scrape all the subcategories and pages under the category header of the Category page: "Category:Computer science". The link for the same is as follows: http://en.wikipedia.org/wiki/Category:Computer_science.
I have got an idea regarding the above mentioned problem, from the following stack overflow answer which is specified in the following link.
Pythonic beautifulSoup4 : How to get remaining titles from the next page link of a wikipedia category
How to scrape Subcategories and pages in categories of a Category wikipedia page using Python
However, the answer does not fully solve the problem. It only scrapes the pages in the category "Computer science", but I want to extract all the subcategory names and their associated pages. I want the process to report the results in BFS manner with a depth of 10. Is there any way to do this?
I found the following code from this linked post:
from pprint import pprint
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests
base_url = 'https://en.wikipedia.org/wiki/Category:Computer science'
def get_next_link(soup):
    """Return the first <a> whose string is exactly "next page", or None.

    `string=` is the current name of the keyword formerly spelled `text=`.
    """
    return soup.find("a", string="next page")
def extract_links(soup):
    """Return the `title` attribute of every anchor matched by "#mw-pages li a".

    Raises KeyError if a matched anchor has no `title` attribute.
    """
    return [a['title'] for a in soup.select("#mw-pages li a")]
# Collect page titles from the category page and from every "next page"
# continuation, reusing one HTTP session for all requests.
with requests.Session() as session:
    content = session.get(base_url).content
    soup = BeautifulSoup(content, 'lxml')

    links = extract_links(soup)
    next_link = get_next_link(soup)
    while next_link is not None:  # while there is a Next Page link
        # The href is resolved against base_url before fetching.
        url = urljoin(base_url, next_link['href'])
        content = session.get(url).content
        soup = BeautifulSoup(content, 'lxml')

        links += extract_links(soup)
        next_link = get_next_link(soup)
To scrape the subcategories, you will have to use selenium to interact with the dropdowns. A simple traversal over the second category of links will yield the pages, however, to find all the subcategories, recursion is needed to properly group the data. The code below utilizes a simple variant of the breadth-first search to determine when to stop looping over the dropdown toggle objects generated at each iteration of the while loop:
from selenium import webdriver
import time
from bs4 import BeautifulSoup as soup
def block_data(_d):
    """Map one group's <h3> text to its [title, href] pairs.

    Each <li> under the group's first <ul> contributes the `title` and `href`
    attributes of its first anchor; list items without an anchor are skipped
    instead of raising AttributeError.
    """
    heading = _d.find('h3').text
    entries = [
        [li.a.attrs.get('title'), li.a.attrs.get('href')]
        for li in _d.find('ul').find_all('li')
        if li.a is not None
    ]
    return {heading: entries}
def get_pages(source:str) -> list:
    """Parse category-page HTML and return one block_data() dict per group.

    Groups are the <div class="mw-category-group"> elements inside the
    <div id="mw-pages"> container. Returns an empty list when the page has no
    such container, instead of raising AttributeError on None.
    """
    pages_container = soup(source, 'html.parser').find('div', {'id':'mw-pages'})
    if pages_container is None:
        return []
    groups = pages_container.find_all('div', {'class':'mw-category-group'})
    return [block_data(group) for group in groups]
d = webdriver.Chrome('/path/to/chromedriver')
# Navigate to the category page before reading page_source; a freshly started
# driver has not loaded any category markup yet.
d.get('https://en.wikipedia.org/wiki/Category:Computer_science')
all_pages = get_pages(d.page_source)
# Toggle titles (data-ct-title) that have already been handled.
_seen_categories = []
def get_categories(source):
    """Return [href, text] for every <a class="CategoryTreeLabel"> in *source* HTML."""
    labels = soup(source, 'html.parser').find_all('a', {'class':'CategoryTreeLabel'})
    return [[label['href'], label.text] for label in labels]
def total_depth(c):
    """Count the leaf entries in a nested category mapping.

    *c* maps a name to a list of child mappings. An entry is a leaf when its
    value is `[{}]` (a single empty child) or `None`; `None` previously raised
    TypeError. An empty child list contributes 0.
    """
    total = 0
    for children in c.values():
        if children is None or (len(children) == 1 and not children[0]):
            total += 1
        else:
            total += sum(total_depth(child) for child in children)
    return total
def group_categories(source) -> dict:
    """Build a nested {label: children} mapping from a category-tree element.

    For each <div class="CategoryTreeSection"> found under *source*, the key
    is the text of the anchor in its "CategoryTreeItem" div. The value is None
    when the section has no "CategoryTreeChildren" divs, otherwise a list with
    one recursively grouped mapping per such div.

    NOTE(review): find_all() searches all descendants, so a nested section
    looks like it is also reported at the outer level — confirm whether that
    duplication is intended.
    """
    grouped = {}
    for section in source.find_all('div', {'class':'CategoryTreeSection'}):
        label = section.find('div', {'class':'CategoryTreeItem'}).a.text
        child_blocks = section.find_all('div', {'class':'CategoryTreeChildren'})
        if child_blocks:
            grouped[label] = [group_categories(block) for block in child_blocks]
        else:
            grouped[label] = None
    return grouped
# Keep expanding category-tree toggles until a full pass finds no new one.
# NOTE(review): the click / record / pause steps and the final `break` are not
# in the snippet as posted; they are reconstructed from the accompanying
# description — confirm before relying on them.
while True:
    # Re-parse at the start of every pass, so the value left in full_dict
    # comes from the pass that found nothing new to expand.
    full_dict = group_categories(soup(d.page_source, 'html.parser'))
    flag = False
    for i in d.find_elements_by_class_name('CategoryTreeToggle'):
        title = i.get_attribute('data-ct-title')
        if title not in _seen_categories:
            # Record the title first so each one is clicked only once.
            _seen_categories.append(title)
            i.click()
            # Short pause after each click; the length is a placeholder.
            time.sleep(1)
            flag = True
    if not flag:
        break
[{'\xa0': [['Computer science', '/wiki/Computer_science'], ['Glossary of computer science', '/wiki/Glossary_of_computer_science'], ['Outline of computer science', '/wiki/Outline_of_computer_science']]},
{'B': [['Patrick Baudisch', '/wiki/Patrick_Baudisch'], ['Boolean', '/wiki/Boolean'], ['Business software', '/wiki/Business_software']]},
{'C': [['Nigel A. L. Clarke', '/wiki/Nigel_A._L._Clarke'], ['CLEVER score', '/wiki/CLEVER_score'], ['Computational human modeling', '/wiki/Computational_human_modeling'], ['Computational social choice', '/wiki/Computational_social_choice'], ['Computer engineering', '/wiki/Computer_engineering'], ['Critical code studies', '/wiki/Critical_code_studies']]},
{'I': [['Information and computer science', '/wiki/Information_and_computer_science'], ['Instance selection', '/wiki/Instance_selection'], ['Internet Research (journal)', '/wiki/Internet_Research_(journal)']]},
{'J': [['Jaro–Winkler distance', '/wiki/Jaro%E2%80%93Winkler_distance'], ['User:JUehV/sandbox', '/wiki/User:JUehV/sandbox']]},
{'K': [['Krauss matching wildcards algorithm', '/wiki/Krauss_matching_wildcards_algorithm']]},
{'L': [['Lempel-Ziv complexity', '/wiki/Lempel-Ziv_complexity'], ['Literal (computer programming)', '/wiki/Literal_(computer_programming)']]},
{'M': [['Machine learning in bioinformatics', '/wiki/Machine_learning_in_bioinformatics'], ['Matching wildcards', '/wiki/Matching_wildcards'], ['Sidney Michaelson', '/wiki/Sidney_Michaelson']]},
{'N': [['Nuclear computation', '/wiki/Nuclear_computation']]}, {'O': [['OpenCV', '/wiki/OpenCV']]},
{'P': [['Philosophy of computer science', '/wiki/Philosophy_of_computer_science'], ['Prefetching', '/wiki/Prefetching'], ['Programmer', '/wiki/Programmer']]},
{'Q': [['Quaject', '/wiki/Quaject'], ['Quantum image processing', '/wiki/Quantum_image_processing']]},
{'R': [['Reduction Operator', '/wiki/Reduction_Operator']]}, {'S': [['Social cloud computing', '/wiki/Social_cloud_computing'], ['Software', '/wiki/Software'], ['Computer science in sport', '/wiki/Computer_science_in_sport'], ['Supnick matrix', '/wiki/Supnick_matrix'], ['Symbolic execution', '/wiki/Symbolic_execution']]},
{'T': [['Technology transfer in computer science', '/wiki/Technology_transfer_in_computer_science'], ['Trace Cache', '/wiki/Trace_Cache'], ['Transition (computer science)', '/wiki/Transition_(computer_science)']]},
{'V': [['Viola–Jones object detection framework', '/wiki/Viola%E2%80%93Jones_object_detection_framework'], ['Virtual environment', '/wiki/Virtual_environment'], ['Visual computing', '/wiki/Visual_computing']]},
{'W': [['Wiener connector', '/wiki/Wiener_connector']]},
{'Z': [['Wojciech Zaremba', '/wiki/Wojciech_Zaremba']]},
{'Ρ': [['Portal:Computer science', '/wiki/Portal:Computer_science']]}]
full_dict is quite large, and due to its size I am unable to post it entirely here, however, below is an implementation of a function to traverse the structure and select all the elements down to a depth of ten:
def trim_data(d, depth, count):
    """Return a copy of the nested mapping *d* cut off at *depth* levels.

    *count* is the level of *d* itself. Entries at level *depth* get `None` as
    their value; shallower entries keep their recursively trimmed children.
    Entries whose children are already `None` stay `None` instead of raising
    TypeError when iterated.
    """
    trimmed = {}
    for name, children in d.items():
        if count == depth or children is None:
            trimmed[name] = None
        else:
            trimmed[name] = [trim_data(child, depth, count+1) for child in children]
    return trimmed
final_subcategories = trim_data(full_dict, 10, 0)
Edit: script to remove leaves from tree:
def remove_empty_children(d):
    """Return a copy of the nested mapping *d* with empty children removed.

    An entry whose children are `[{}]` becomes `None`, and empty mappings
    among an entry's children are dropped. Entries that are already `None`
    are kept as `None` instead of raising TypeError, so the function can be
    applied to its own output.
    """
    cleaned = {}
    for name, children in d.items():
        if children is None or (len(children) == 1 and not children[0]):
            cleaned[name] = None
        else:
            cleaned[name] = [remove_empty_children(child) for child in children if child]
    return cleaned
When running the above:
c = {'Areas of computer science': [{'Algorithms and data structures': [{'Abstract data types': [{'Priority queues': [{'Heaps (data structures)': [{}]}, {}], 'Heaps (data structures)': [{}]}]}]}]}
d = remove_empty_children(c)
{'Areas of computer science': [{'Algorithms and data structures': [{'Abstract data types': [{'Priority queues': [{'Heaps (data structures)': None}], 'Heaps (data structures)': None}]}]}]}
Edit 2: flattening the entire structure:
def flatten_groups(d):
    """Yield every key of the nested mapping *d* in depth-first order.

    Each key is yielded before the keys of its children; a value of `None`
    means the entry has no children to descend into.
    """
    for name, children in d.items():
        yield name
        if children is not None:
            for child in children:
                yield from flatten_groups(child)
['Areas of computer science', 'Algorithms and data structures', 'Abstract data types', 'Priority queues', 'Heaps (data structures)', 'Heaps (data structures)']
Edit 3:
To access all the pages for every subcategory to a certain level, the original get_pages function can be utilized and a slightly different version of the group_categories method
def _group_categories(source) -> dict:
return {i.find('div', {'class':'CategoryTreeItem'}).find('a')['href']:(lambda x:None if not x else [group_categories(c) for c in x])(i.find_all('div', {'class':'CategoryTreeChildren'})) for i in source.find_all('div', {'class':'CategoryTreeSection'})}
from collections import namedtuple
page = namedtuple('page', ['pages', 'children'])
def subcategory_pages(d, depth, current = 0):
    """Fetch the page listing of every subcategory in *d* down to *depth*.

    *d* maps a site-relative category path to a list of child mappings (or
    None). For each key, the page at https://en.wikipedia.org<key> is
    downloaded and parsed with get_pages(), and the number of groups found is
    printed. Returns {key: page(pages, children)}, where `children` is None at
    level *depth* or when the entry has no children (previously a TypeError),
    and otherwise a list of recursively built mappings.
    """
    r = {}
    for a, b in d.items():
        all_pages_listing = get_pages(requests.get(f'https://en.wikipedia.org{a}').text)
        print(f'page number for {a}: {len(all_pages_listing)}')
        if current == depth or b is None:
            children = None
        else:
            children = [subcategory_pages(i, depth, current+1) for i in b]
        r[a] = page(all_pages_listing, children)
    return r
print(subcategory_pages(full_dict, 2))
Please note that in order to utilize subcategory_pages, _group_categories must be used in place of group_categories.