Crawl site with infinite scrolling - python

I'm trying to crawl flipkart, but flipkart does not load its page at once. So I'm not able to crawl it. Please help.
from bs4 import BeautifulSoup
import requests
import re
import MySQLdb
import urllib2
import urllib
# Offers listing page to scrape.
url = "https://www.flipkart.com/offers-list/weekend-specials?screen=dynamic&pk=contentTheme%3DLS-Nov-Weekend_widgetType%3DdealCard&wid=4.dealCard.OMU&otracker=hp_omu_Weekend+Specials_1"
# A plain HTTP GET returns only the HTML the server sends; no JavaScript is run.
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
# Collect every <div class="iUmrbN"> present in the fetched HTML.
# NOTE(review): presumably these divs are inserted client-side after load,
# which would leave this list empty -- confirm against the raw response.
name = soup.find_all("div", {"class": "iUmrbN"})
for i in name:
    # The loop body must be indented; print(...) with a single argument
    # behaves the same under Python 2 and Python 3.
    print(i.text)
This is not giving any output.

Related

How to breakup webpage text

I assume I have to use the <br/> tags to break up this text and extract only the fixtures, but I cannot figure it out at all!
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pandas as pd
import pprint
import json
from urllib.request import urlopen
# Article whose paragraphs are to be extracted.
fixtures_url = 'https://bleacherreport.com/articles/10005879-epl-schedule-2021-22-official-list-of-fixtures-for-new-premier-league-season'
# Download the article with an HTTP GET.
response = requests.get(fixtures_url)
# Parse the raw response bytes with Python's built-in HTML parser.
document = BeautifulSoup(response.content, "html.parser")
# Select <p> tags using an empty-string class filter and print the result list.
paragraphs = document.find_all('p', attrs={'class': ''})
print(paragraphs)

Getting a specific value from a website in python

I have a product link on AliExpress, and I want to get the product's price value and print it. Like this:
import requests
import urllib2
import sys
import urllib.parse
import urllib.request
import re
from bs4 import BeautifulSoup as BS
# Product page from which the price should be extracted.
url ="https://www.amazon.com/Optimum-Nutrition-Standard-Naturally-Flavored/dp/B00QQA0H3S?pd_rd_wg=Hiuuc&pd_rd_r=2542737c-992b-4b5c-b7de-05acce3929d5&pd_rd_w=be1cP&ref_=pd_gw_simh&pf_rd_r=WFBB3JV61PKGSDJRW0C1&pf_rd_p=b841581f-e864-5164-afa6-4c18a8348879"
usock = urllib2.urlopen(url)
try:
    data = usock.read()
finally:
    # Close the connection even when the read fails.
    usock.close()
# Name the parser explicitly so the result does not depend on which
# parser libraries happen to be installed.
soup = BS(data, "html.parser")
# [0] raises IndexError when the fetched HTML has no <span id="volume">.
price = soup.findAll("span", {"id": "volume"})[0]
# print(...) with a single argument works under both Python 2 and Python 3.
print(price)
In this Amazon link, I want to find the 55.01 value. (Amazon is just an example. The website does not matter. It may be AliExpress, etc.)

Having trouble in getting page source with beautifulsoup

I am trying to get the HTML source of a web page using beautifulsoup.
import bs4 as bs
import requests
import urllib.request
# Forum thread whose HTML source should be printed.
sourceUrl='https://www.pakwheels.com/forums/t/planing-a-trip-from-karachi-to-lahore-by-road-in-feb-2017/414115/2.html'
# Use the response as a context manager so the connection is closed
# as soon as the body has been read.
with urllib.request.urlopen(sourceUrl) as response:
    source = response.read()
# Parse the downloaded bytes and print the resulting document.
soup=bs.BeautifulSoup(source,'html.parser')
print(soup)
I want the HTML source of the page. This is what I am getting now:
'ps.store("siteSettings", {"title":"PakWheels Forums","contact_email":"sami.ullah#pakeventures.com","contact_url":"https://www.pakwheels.com/main/contact_us","logo_url":"https://www.pakwheels.com/assets/logo.png","logo_small_url":"/images/d-logo-sketch-small.png","mobile_logo_url":"data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4NCjwhLS0gR2VuZXJhdG9yOiBBZG9iZSBJbGx1c3RyYXRvciAxNi4wLjAsIFNWRyBFeHBvcnQgUGx1Zy1JbiAuIFNWRyBWZXJzaW9uOiA2LjAwIEJ1aWxkIDApICAtLT4NCjwhRE9DVFlQRSBzdmcgUFVCTElDICItLy9XM0MvL0RURCBTVkcgMS4xLy9FTiIgImh0dHA6Ly93d3cudzMub3JnL0dyYXBoaWNzL1NWRy8xLjEvRFREL3N2ZzExLmR0ZCI+DQo8c3ZnIHZlcnNpb249IjEuMSIgaWQ9IkxheWVyXzEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgeG1sbnM6eGxpbms9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkveGxpbmsiIHg9IjBweCIgeT0iMHB4Ig0KCSB3aWR0aD0iMjQwcHgiIGhlaWdodD0iNjBweCIgdmlld0JveD0iMCAwIDI0MCA2MCIgZW5hYmxlLWJhY2tncm91bmQ9Im5ldyAwIDAgMjQwIDYwIiB4bWw6c3BhY2U9InByZXNlcnZlIj4NCjxwYXRoIGZpbGw9IiNGRkZGRkYiIGQ9Ik02LjkwMiwyMy4yODZDMzQuNzc3LDIwLjI2Miw1Ny4yNC'
Have a look at this code:
from urllib import request
from bs4 import BeautifulSoup
# Page to download and pretty-print.
url_1 = "http://www.google.com"
# Parse while the response is open, then let the context manager close it.
with request.urlopen(url_1) as page:
    # Name the parser explicitly so the result does not depend on which
    # parser libraries happen to be installed.
    soup = BeautifulSoup(page, "html.parser")
print(soup.prettify())
Import everything you need correctly. Read this.

BeautifulSoup doesn't extract the table

import urllib.request as urllib2 #To query website
from bs4 import BeautifulSoup #To parse website
import pandas as pd
# Specify the URL and open it with urllib.request (imported under the alias urllib2).
url3 = 'http://www.thatscricket.com/ipl/2014/results/index.html'
req = urllib2.urlopen(url3)
# Parse the open response object with the html5lib parser.
soup = BeautifulSoup(req,"html5lib")
# Collect every <table> element found in the parsed document and print the list.
all_tables=soup.find_all('table')
print(all_tables)
If you see the content of your requested data
content = req.readall()
as you examine the content:
print(content)
and surprisingly there is no table!
But if you check the page source you can see tables in it.
From what I examined, there seems to be a problem with urllib.request: some escape sequence on the page causes urllib to get only part of that page.
So I was able to fix the problem by using requests instead of urllib.
first
pip install requests
Then change your code to this:
import requests
from bs4 import BeautifulSoup
# Results page whose tables should be listed.
results_url = 'http://www.thatscricket.com/ipl/2014/results/index.html'
# Download the page with requests.
response = requests.get(results_url)
# Parse the raw response bytes with the html5lib parser.
document = BeautifulSoup(response.content, "html5lib")
# Gather every <table> element and print the resulting list.
tables = document.find_all('table')
print(tables)

Python BeautifulSoup cannot find the data in the page source

On this website I can see the data I want, and when I inspect the element I find it; however, it is not there in the page source, so I cannot scrape it:
import requests
from bs4 import BeautifulSoup, Tag
from lxml import html
import requests
import MySQLdb
import urllib2
import itertools
import re
import sys
from datetime import date, timedelta as td
# Match page to scrape; the request carries a browser-like User-Agent header.
urls =("http://euw.lolesports.com/tourney/match/1833")
hdr = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(urls,headers=hdr)
page = urllib2.urlopen(req)
# Name the parser explicitly so the result does not depend on which
# parser libraries happen to be installed.
soup = BeautifulSoup(page, "html.parser")

# Print the matched tournament divs and the page title.
# print(...) with a single argument works under both Python 2 and Python 3.
tournament=soup.findAll('div',{'class':['field-item even']})
print(tournament)
print(soup.title.text)

# Print the "datetime" attribute of each local-time span that has one.
match_time=soup.findAll('span',{'class':['local-time']})
for tag in match_time:
    match_dt = tag.get('datetime')
    if match_dt is not None:
        print(match_dt)

# Print the "href" attribute of each matched iframe that has one.
# NOTE(review): iframes normally carry their URL in "src", not "href" -- confirm.
vid = soup.findAll('iframe',{'class':['media-youtube-player']})
for tag in vid:
    vidlink = tag.get('href')
    if vidlink is not None:
        print(vidlink)

# Detach each team-name heading from the tree, then print its text.
teams=soup.findAll('h5',{'class':['team-name']})
for tag in teams:
    tag.replaceWith('')
    print(tag.string)

print(soup.findAll('span',{'class':['winner-holder']}))
I was able to retrieve the title and teams, but had no luck with everything else. When I click on each element to inspect it, I can see the data there; however, when I view the page source they are empty tags, which is why I think I am not getting any results.
Is there a way to overcome this?

Categories

Resources