I am trying to crawl a site's text, but it only crawls 12 articles. I don't know why it does that, and I'm wondering: if I want to crawl the other pages, what should I do?
import requests
from bs4 import BeautifulSoup

x = int(input("start page:"))
while x < int(input("end page:")):
    x = x + 1
    url = "https://www.mmtimes.com/national-news.html?page=" + str(x)
    result = requests.get(url)
    bs_obj = BeautifulSoup(result.content, "html.parser")
    content = bs_obj.find("div", {"class": "msp-three-col"})
    read_more = content.findAll("div", {"class": "read-more"})
    for item in read_more:
        atag = item.find('a')
        link = "https://www.mmtimes.com" + atag["href"]
        linkResult = requests.get(link)
        subpage = BeautifulSoup(linkResult.content, "html.parser")
        fnresult = subpage.find("div", {"class": "field-item even"})
        print(fnresult.text)
print("Total " + str(len(read_more)) + " articles")
Check out the code below; I have made some changes, and it should produce the required output.
import requests
from bs4 import BeautifulSoup

x = int(input("start page:"))
y = input("end page:")
article_count = 0

while x <= int(y):
    # fetch one listing page
    url = "https://www.mmtimes.com/national-news.html?page=" + str(x)
    result = requests.get(url)
    bs_obj = BeautifulSoup(result.content, "html.parser")
    content = bs_obj.find("div", {"class": "msp-three-col"})
    read_more = content.findAll("div", {"class": "read-more"})
    # follow every article link on this page
    for item in read_more:
        atag = item.find('a')
        link = "https://www.mmtimes.com" + atag["href"]
        linkResult = requests.get(link)
        subpage = BeautifulSoup(linkResult.content, "html.parser")
        fnresult = subpage.find("div", {"class": "field-item even"})
        print(fnresult.text)
    # keep a running total across pages, then move to the next page
    article_count += len(read_more)
    print("Total " + str(article_count) + " articles")
    x += 1
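If you prefer, the same pagination can also be written as a for loop over range, so you don't have to manage the counter by hand. A minimal sketch, assuming the same page structure and selectors as above:

import requests
from bs4 import BeautifulSoup

start = int(input("start page:"))
end = int(input("end page:"))
article_count = 0

for page in range(start, end + 1):  # range's end is exclusive, so add 1
    url = "https://www.mmtimes.com/national-news.html?page=" + str(page)
    bs_obj = BeautifulSoup(requests.get(url).content, "html.parser")
    content = bs_obj.find("div", {"class": "msp-three-col"})
    # each "read-more" block links to one article
    for item in content.findAll("div", {"class": "read-more"}):
        link = "https://www.mmtimes.com" + item.find("a")["href"]
        subpage = BeautifulSoup(requests.get(link).content, "html.parser")
        print(subpage.find("div", {"class": "field-item even"}).text)
        article_count += 1

print("Total " + str(article_count) + " articles")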
I am practicing web scraping and am using this code, trying to get the for loop working.
import requests
from bs4 import BeautifulSoup

name = []
link = []
address = []

for i in range(1, 11):
    i = str(i)
    url = "https://forum.iktva.sa/exhibitors-list?&page=" + i + "&searchgroup=37D5A2A4-exhibitors"
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a in soup.select(".m-exhibitors-list__items__item__header__title__link"):
        company_url = "https://forum.iktva.sa/" + a["href"].split("'")[1]
        soup2 = BeautifulSoup(requests.get(company_url).content, "html.parser")
        n = soup2.select_one(".m-exhibitor-entry__item__header__title").text
        l = soup2.select_one("h4+a")["href"]
        a = soup2.select_one(".m-exhibitor-entry__item__body__contacts__address").text
        name.append(n)
        link.append(l)
        address.append(a)
When I run the program I get this error:
l = soup2.select_one("h4+a")["href"]
TypeError: 'NoneType' object is not subscriptable
I am not sure how to solve the problem.
You just need to replace that line with the following code to handle None:
l = soup2.select_one("h4+a")
if l:
    l = l["href"]
else:
    l = "Website not available"
As you can see, this happens because the website link is not available for:
https://forum.iktva.sa/exhibitors/sanad
Or you can handle all the errors like this:
import requests
from bs4 import BeautifulSoup


def get_object(obj, attr=None):
    # return the attribute (or text) of a tag, or a placeholder if it is missing
    try:
        if attr:
            return obj[attr]
        else:
            return obj.text
    except:
        return "Not available"


name = []
link = []
address = []

for i in range(1, 11):
    i = str(i)
    url = f"https://forum.iktva.sa/exhibitors-list?&page={i}&searchgroup=37D5A2A4-exhibitors"
    soup = BeautifulSoup(requests.get(url).text, features="lxml")
    for a in soup.select(".m-exhibitors-list__items__item__header__title__link"):
        company_url = "https://forum.iktva.sa/" + a["href"].split("'")[1]
        soup2 = BeautifulSoup(requests.get(company_url).content, "html.parser")
        n = soup2.select_one(".m-exhibitor-entry__item__header__title")
        n = get_object(n)
        l = soup2.select_one("h4+a")
        l = get_object(l, 'href')
        a = soup2.select_one(".m-exhibitor-entry__item__body__contacts__address")
        a = get_object(a)
        name.append(n)
        link.append(l)
        address.append(a)
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
import requests

url = 'https://en.wikisource.org/wiki/Main_Page'
r = requests.get(url)
Soup = BeautifulSoup(r.text, "html5lib")
List = Soup.find("div", class_="enws-mainpage-widget-content", id="enws-mainpage-newtexts-content").find_all('a')
ebooks = []
i = 0
for ebook in List:
    x = ebook.get('title')
    for ch in x:
        if ch == ":":
            x = ""
    if x != "":
        ebooks.append(x)
        i = i + 1

inputnumber = 0
while inputnumber < len(ebooks):
    print(inputnumber + 1, " - ", ebooks[inputnumber])
    inputnumber = inputnumber + 1

input = int(input("Please select a book: "))
selectedbook = Soup.find("a", title=ebooks[input - 1])
print(selectedbook['title'])
url1 = "https://en.wikisource.org/" + selectedbook['href']
print(url1)
r1 = requests.get(url1)
Soup1 = BeautifulSoup(r1.text, "html5lib")
List1 = Soup.find("div", class_="prp-pages-output")
print(List1)
This is my code. I want to get the paragraphs from the HTML at the last step, but as output I get:
1 - The Center of the Web
2 - Bobby Bumps Starts a Lodge
3 - May (Mácha)
4 - Animal Life and the World of Nature/1903/06/Notes and Comments
5 - The Czechoslovak Review/Volume 2/No Compromise
6 - She's All the World to Me
7 - Their One Love
Please select a book: 4
Animal Life and the World of Nature/1903/06/Notes and Comments
https://en.wikisource.org//wiki/Animal_Life_and_the_World_of_Nature/1903/06/Notes_and_Comments
None
Why does List1 come back as None? It shouldn't. Can someone tell me where I am going wrong?
I guess you just made a typo and used Soup where you meant Soup1. Also, I think you need more than one element when you are looking for a list of items, so I added the find_all() function.
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
import requests

url = "https://en.wikisource.org/wiki/Main_Page"
r = requests.get(url)
Soup = BeautifulSoup(r.text, "html5lib")
List = Soup.find(
    "div", class_="enws-mainpage-widget-content", id="enws-mainpage-newtexts-content"
).find_all("a")
ebooks = []
i = 0
for ebook in List:
    x = ebook.get("title")
    for ch in x:
        if ch == ":":
            x = ""
    if x != "":
        ebooks.append(x)
        i = i + 1

inputnumber = 0
while inputnumber < len(ebooks):
    print(inputnumber + 1, " - ", ebooks[inputnumber])
    inputnumber = inputnumber + 1

input = int(input("Please select a book: "))
selectedbook = Soup.find("a", title=ebooks[input - 1])
print(selectedbook["title"])
url1 = "https://en.wikisource.org/" + selectedbook["href"]
print(url1)
r1 = requests.get(url1)
Soup1 = BeautifulSoup(r1.text, "html5lib")
List1 = Soup1.find_all("div", class_="prp-pages-output")
print(List1)
I'm quite new to Python and have written a script using BeautifulSoup to parse a website table. I've tried everything but can't get the loop to cycle through the pages; it currently just repeats the data from the first page 8 times (the number of pages).
Can anyone please help?
Code:
import requests
from bs4 import BeautifulSoup

first_year = 2020
last_year = 2020

for i in range(last_year - first_year + 1):
    year = str(first_year + i)
    print("Running for year:", year)
    text = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID=" + year).text
    soup = BeautifulSoup(text, "html.parser")
    options = soup.findAll("option")
    opts = []
    for option in options:
        if not option['value'].startswith("20") and not option['value'].startswith("19") and option["value"]:
            opts.append({option["value"]: option.contents[0]})
    for opt in opts:
        for key, value in opt.items():
            print("Doing option:", value)
            text = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID=" + year + "&Round=" + key).text
            pages_soup = BeautifulSoup(text, "html.parser")
            p = pages_soup.findAll("a")
            pages = 8
            if "&Page=" in str(p[-2]):
                pages = int(p[-2].contents[0])
            for j in range(pages):
                print("Page {}/{}".format(str(j+1), str(pages)))
                parse = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID={}&Round={}&Page={}".format(year, key, j+1)).text
                p_soup = BeautifulSoup(text, "html.parser")
                tbody = pages_soup.findAll("tbody")
                tbody_soup = BeautifulSoup(str(tbody), "html.parser")
                tr = tbody_soup.findAll("tr")
                for t in tr:
                    t = str(t).replace("</tr>", "").replace("<tr>", "").replace("amp;", "")
                    t = t[4:len(t)-5].split('</td><td>')
                    t.append(str(j+1))
                    t.append(str(value))
                    t.append(str(year))
                    open("output.csv", "a").write("\n" + ";".join(t))
Thank you.
Try this. In your inner loop you pass the old variable text to BeautifulSoup instead of the newly fetched parse, and then read the table rows from pages_soup instead of p_soup, so every iteration just reprocesses the first page:
for j in range(pages):
    print("Page {}/{}".format(str(j+1), str(pages)))
    parse = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID={}&Round={}&Page={}".format(year, key, j+1)).text
    p_soup = BeautifulSoup(parse, "html.parser")
    tbody = p_soup.findAll("tbody")
    tbody_soup = BeautifulSoup(str(tbody), "html.parser")
    tr = tbody_soup.findAll("tr")
    for t in tr:
        t = str(t).replace("</tr>", "").replace("<tr>", "").replace("amp;", "")
        t = t[4:len(t)-5].split('</td><td>')
        t.append(str(j+1))
        t.append(str(value))
        t.append(str(year))
        open("output.csv", "a").write("\n" + ";".join(t))
I developed this program to scrape Newegg for PS4 prices. However, I want to scrape multiple pages. Here is what I have, but once it scrapes the first page the program stops. Basically I am trying to change the link so 'Page-1' changes to 2, 3, 4, etc. Is there a better way to do this?
from bs4 import BeautifulSoup
import requests
import csv

page_num = 1
prod_num = 0
source = requests.get('https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-' + str(page_num) + '?PageSize=36&order=BESTMATCH').text
soup = BeautifulSoup(source, 'lxml')

csv_file = open('newegg_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Product', 'Price', 'Shipping_info'])

for info in soup.find_all('div', class_='item-container'):
    prod = info.find('a', class_='item-title').text.strip()
    price = info.find('li', class_='price-current').text.strip().splitlines()[1].replace(u'\xa0', '')
    if u'$' not in price:
        price = info.find('li', class_='price-current').text.strip().splitlines()[0].replace(u'\xa0', '')
    ship = info.find('li', class_='price-ship').text.strip()
    print(prod)
    print(price)
    print(ship)
    csv_writer.writerow([prod, price, ship])
    prod_num += 1
    if prod_num > 35:  # there are about 35 items per Newegg page
        page_num += 1
    # print(price.splitlines()[1])
    print('-----------')

csv_file.close()
I found the page-limit number in the page's pagination text, and I think you can get the page limit with XPath or in other ways:

# the XPath syntax may look like this
# //span[@class='list-tool-pagination-text']

Hope it's useful for you.
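For example, a minimal sketch using lxml to read that pagination text; the span class is taken from the answer above, and whether it still appears on the live Newegg page, and what exact text it holds, is an assumption:

import requests
from lxml import html

# a sample listing URL from the question
url = 'https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-1?PageSize=36&order=BESTMATCH'
tree = html.fromstring(requests.get(url).text)

# look for the pagination span mentioned above
texts = tree.xpath("//span[@class='list-tool-pagination-text']//text()")
if texts:
    # assumption: the text ends in a "current/last" pair such as "1/8"
    last_page = texts[-1].strip().split('/')[-1]
    print("last page:", last_page)
else:
    print("pagination span not found")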
If you look at the last page, the "Next" button tag has the attribute "disabled", so [tag_name].has_attr('disabled') returns True. You can use this to manage pagination.
import requests
from bs4 import BeautifulSoup
import csv

csv_file = open('newegg_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Product', 'Price', 'Shipping_info'])

URL_PART1 = "https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-"
URL_PART2 = "?PageSize=36&order=BESTMATCH"
PAGE_NO = 1
url = URL_PART1 + str(PAGE_NO) + URL_PART2

while len(url):
    PAGE_NO += 1
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html.parser')
    all_divs = soup.find_all('div', attrs={'class': 'item-info'})
    for item in all_divs:
        prod = ""
        price = ""
        ship = ""
        # get product name
        prod = item.find('a', attrs={'class': 'item-title'})
        if prod:
            prod = prod.text.strip()
        # get price
        price_part = item.find('li', attrs={'class': 'price-current'})
        if price_part:
            price_part1 = price_part.strong
            if price_part1:
                price_part1 = price_part1.text.strip()
            price_part2 = price_part.sup
            if price_part2:
                price_part2 = price_part2.text.strip()
            if price_part1 and price_part2:
                price = price_part1 + price_part2
        # get shipping info
        ship = item.find('li', attrs={'class': 'price-ship'})
        if ship:
            ship = ship.text.strip()
        csv_writer.writerow([prod, price, ship])
    # manage pagination
    next_button = soup.find('button', attrs={'title': 'Next'})
    if not next_button.has_attr('disabled'):
        url = URL_PART1 + str(PAGE_NO) + URL_PART2
    else:
        url = ""
My code is showing the first page of hotels. Why isn't it showing more?
import csv
import requests
from bs4 import BeautifulSoup

hotels = []
i = 0
url0 = 'https://www.tripadvisor.com/Hotels-g295424-Dubai_Emirate_of_Dubai-Hotels.html#EATERY_LIST_CONTENTS'
r = requests.get(url0)
data = r.text
soup = BeautifulSoup(r.text, "html.parser")

with open('hotels_Data.csv', 'wb') as file:
    for link in soup.findAll('a', {'property_title'}):
        print('https://www.tripadvisor.com/Hotels-g295424-' + link.get('href'))
        print(link.string)
    for i in range(20):
        while int(i) <= (20):
            i = str(i)
            url1 = 'https://www.tripadvisor.com/Hotels-g295424-oa' + i + '-Dubai_Emirate_of_Dubai-Hotels.html#EATERY_LIST_CONTENTS'
            r1 = requests.get(url1)
            data1 = r1.text
            soup1 = BeautifulSoup(data1, "html.parser")
            for link in soup1.findAll('a', {'property_title', 'price'}):
                print('https://www.tripadvisor.com/Hotels-g294212-' + link.get('href'))
                print(link.string)
            for link in soup.select("a.reference.internal"):
                url1 = link["href"]
                absolute_url = urljoin(base_url, url1)
                print(url1, absolute_url)
            writer = csv.writer(file)
            for row in hotels:
                writer.writerow([s.encode("utf-8") for s in row])
            break
Check the links to the next pages at the bottom of the page: this portal doesn't use page numbers (1, 2, 3, etc.) but an offset (0, 30, 60, 90, etc.), because it displays 30 offers per page.
So you have to use the values 0, 30, 60, 90, etc. in the url:
"...-oa" + offset + "-Dubai_Emirate..."
You can use, for example, range(0, 250, 30) to get the values 0, 30, 60, 90, and so on.
import requests
from bs4 import BeautifulSoup

for offset in range(0, 250, 30):
    print('--- page offset:', offset, '---')
    url = 'https://www.tripadvisor.com/Hotels-g295424-oa' + str(offset) + '-Dubai_Emirate_of_Dubai-Hotels.html#EATERY_LIST_CONTENTS'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all('a', {'property_title'}):
        print(link.text)
But there can be more than 250 offers, so you have to get the link to the last page to use the correct value instead of 250:
import requests
from bs4 import BeautifulSoup

offset = 0
url = 'https://www.tripadvisor.com/Hotels-g295424-oa' + str(offset) + '-Dubai_Emirate_of_Dubai-Hotels.html#EATERY_LIST_CONTENTS'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

for link in soup.find_all('a', {'last'}):
    page_number = link.get('data-page-number')
    last_offset = int(page_number) * 30
    print('last offset:', last_offset)
Then use last_offset+1 in range(0, last_offset+1, 30), as in the sketch below.
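Putting the two snippets together, a minimal sketch (it assumes the 'last' link with its data-page-number attribute and the 'property_title' links are still present on the page):

import requests
from bs4 import BeautifulSoup

base = 'https://www.tripadvisor.com/Hotels-g295424-oa{}-Dubai_Emirate_of_Dubai-Hotels.html#EATERY_LIST_CONTENTS'

# read the last page number from the first page to compute the last offset
r = requests.get(base.format(0))
soup = BeautifulSoup(r.text, "html.parser")
last_offset = 0
for link in soup.find_all('a', {'last'}):
    last_offset = int(link.get('data-page-number')) * 30

# walk every page by offset
for offset in range(0, last_offset + 1, 30):
    print('--- page offset:', offset, '---')
    r = requests.get(base.format(offset))
    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all('a', {'property_title'}):
        print(link.text)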
EDIT: The restaurant pages use JavaScript and AJAX to load their data:
import requests
from bs4 import BeautifulSoup
size = 30
# direct url - doesn't have expected information
#url = 'https://www.tripadvisor.com/Restaurants-g187791-Rome_Lazio.html'
# url used by AJAX
url = 'https://www.tripadvisor.com/RestaurantSearch?Action=PAGE&geo=187791&ajax=1&itags=10591&sortOrder=relevance&o=a' + str(size) + '&availSearchEnabled=true&eaterydate=2017_04_27&date=2017-04-28&time=20%3A00%3A00&people=2'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
link = soup.find_all('a')[-1]
page_number = link.get('data-page-number')
last_offset = int(page_number) * size # *30
print('last offset:', last_offset)
offset = link.get('data-offset')
offset = int(offset) + size # +30
print('offset:', offset)
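And a minimal sketch of walking those AJAX pages by offset; note that the restaurant-link selector below ('property_title') is only a hypothetical placeholder, since the real class in the AJAX HTML will differ and needs to be inspected:

import requests
from bs4 import BeautifulSoup

size = 30
ajax_url = ('https://www.tripadvisor.com/RestaurantSearch?Action=PAGE&geo=187791'
            '&ajax=1&itags=10591&sortOrder=relevance&o=a{}'
            '&availSearchEnabled=true&eaterydate=2017_04_27&date=2017-04-28'
            '&time=20%3A00%3A00&people=2')

# read the last page number from the first AJAX response (same trick as above)
soup = BeautifulSoup(requests.get(ajax_url.format(0)).text, "html.parser")
last_offset = int(soup.find_all('a')[-1].get('data-page-number')) * size

# then request every AJAX page by offset
for offset in range(0, last_offset + 1, size):
    print('--- offset:', offset, '---')
    soup = BeautifulSoup(requests.get(ajax_url.format(offset)).text, "html.parser")
    # 'property_title' is a hypothetical selector; inspect the AJAX HTML
    # to find the real class used for restaurant links
    for link in soup.find_all('a', {'property_title'}):
        print(link.text)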