Can't modify existing logic to parse titles from next pages - python

I've created a Python script using the requests module to get the titles of the items returned by a search on duckduckgo.com. My search keyword is cricket. The script parses the titles from the first page flawlessly.
I'm having trouble parsing the titles from the following pages because two of the params fields, 's' and 'dc', increase in an irregular way (starting from 's': '0' and 'dc': '-27'), while the rest of the fields stay static.
To parse titles from the first page, I tried like below (working):
import requests
from bs4 import BeautifulSoup

URL = "https://duckduckgo.com/html/"

params = {
    'q': 'python',
    's': '0',
    'nextParams': '',
    'v': 'l',
    'o': 'json',
    'dc': '-27',
    'api': 'd.js',
    'kl': 'us-en'
}

resp = requests.post(URL, data=params, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(resp.text, "lxml")
for title in soup.select(".result__body .result__a"):
    print(title.text)
Those two params fields increase like below:
1st page:
's': '0'
'dc': '-27'
2nd page:
's': '30'
'dc': '27'
Third page:
's': '80'
'dc': '76'
Fourth page:
's': '130'
'dc': '126'
How can I scrape the titles from the subsequent pages as well?

The params for the next page are carried in hidden form fields of each POST response, so you can read them from the response and send them with the next request:
import requests
from bs4 import BeautifulSoup

URL = "https://duckduckgo.com/html/"

params = {
    'q': 'python',
    's': '0',
    'nextParams': '',
    'v': 'l',
    'o': 'json',
    'dc': '0',
    'api': 'd.js',
    'kl': 'us-en'
}

with requests.Session() as s:
    while True:
        resp = s.post(URL, data=params, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(resp.text, "lxml")
        for title in soup.select(".result__body .result__a"):
            print(title.text)
        for i in soup.select('form:not(.header__form) [type=hidden]'):  # update params based on the response
            params[i['name']] = i['value']
        if not soup.select_one('[value=Next]'):
            break
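The same idea as a bounded sketch (my own variation, not part of the original answer): MAX_PAGES and the sleep interval are illustrative values used to cap the loop and pause between requests.

import time

import requests
from bs4 import BeautifulSoup

URL = "https://duckduckgo.com/html/"
params = {'q': 'python', 's': '0', 'nextParams': '', 'v': 'l',
          'o': 'json', 'dc': '0', 'api': 'd.js', 'kl': 'us-en'}
MAX_PAGES = 5  # illustrative cap, not part of the original answer

with requests.Session() as s:
    s.headers["User-Agent"] = "Mozilla/5.0"
    for page in range(MAX_PAGES):
        soup = BeautifulSoup(s.post(URL, data=params).text, "lxml")
        for title in soup.select(".result__body .result__a"):
            print(title.text)
        # carry the server-generated 's'/'dc' values over to the next request
        for field in soup.select('form:not(.header__form) [type=hidden]'):
            params[field['name']] = field['value']
        if not soup.select_one('[value=Next]'):
            break
        time.sleep(2)  # small polite delay between pages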

Related

Can't get info of a lxml site with Request and BeautifulSoup

I'm trying to make a test project that scrapes info from a specific site, but with no success.
I followed some tutorials I found and even a post on Stack Overflow. After all this I'm stuck!
Help me out, stepbrothers; I'm an eager new Python programmer and I can't give up on my projects.
More info: this is a lottery website that I was trying to scrape and analyse to get a lucky number.
I have followed this tutorials:
https://towardsdatascience.com/how-to-collect-data-from-any-website-cb8fad9e9ec5
https://beautiful-soup-4.readthedocs.io/en/latest/
Using BeautifulSoup in order to find all "ul" and "li" elements
All of you have my gratitude!
from bs4 import BeautifulSoup as bs
import requests
import html5lib
#import urllib3  # another attempt to make another request to the url ------ failed

url = '''https://loterias.caixa.gov.br/Paginas/Mega-Sena.aspx'''

# another try to take the results from the <ul>, but I get no qualified results == None
def parse_ul(elem):  # https://stackoverflow.com/questions/50338108/using-beautifulsoup-in-order-to-find-all-ul-and-li-elements
    result = {}
    for sub in elem.find_all('li', recursive=False):
        if sub.li is None:
            continue
        data = {k: v for k, v in sub.attrs.items()}
        if sub.ul is not None:
            # recurse down
            data['children'] = parse_ul(sub.ul)
        result[sub.li.get_text(strip=True)] = data
    return result

page = requests.get(url)  # taking info from the website
print(page.encoding)  # == UTF-8
soup = bs(page.content, features="lxml")  # takes all info from the url and organizes it == Beautiful Soup
numbers = soup.find(id='ulDezenas')  # search the content for this specific id // another try: soup.find('ul', {'class': ''})
result = parse_ul(soup)  # try to parse the info, but none is found EVEN WITH THE ORIGINAL ONE
print(numbers)  # The result is below:
'''<ul class="numbers megasena" id="ulDezenas">
<li ng-repeat="dezena in resultado.listaDezenas ">{{dezena.length > 2 ? dezena.slice(1) : dezena}}</li>
</ul>'''
print(result)  # == "{}" nothing found

#with open('''D:\Documents\python\_abretesesame.txt''', 'wb') as fd:
#    for chunk in page.iter_content(chunk_size=128):
#        fd.write(chunk)
# ======= printing the document (HTML) to a file; still no success in getting the numbers
The main issue is that the content is provided dynamically by JavaScript, but you can get the information via another URL:
jsonData = requests.get('https://servicebus2.caixa.gov.br/portaldeloterias/api/megasena/').json()
will give you the following JSON:
{'tipoJogo': 'MEGA_SENA', 'numero': 2468, 'nomeMunicipioUFSorteio': 'SÃO PAULO, SP', 'dataApuracao': '02/04/2022', 'valorArrecadado': 158184963.0, 'valorEstimadoProximoConcurso': 3000000.0, 'valorAcumuladoProximoConcurso': 0.0, 'valorAcumuladoConcursoEspecial': 36771176.89, 'valorAcumuladoConcurso_0_5': 33463457.98, 'acumulado': False, 'indicadorConcursoEspecial': 1, 'dezenasSorteadasOrdemSorteio': ['022', '041', '053', '042', '035', '057'], 'listaResultadoEquipeEsportiva': None, 'numeroJogo': 2, 'nomeTimeCoracaoMesSorte': '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 'tipoPublicacao': 3, 'observacao': '', 'localSorteio': 'ESPAÇO DA SORTE', 'dataProximoConcurso': '06/04/2022', 'numeroConcursoAnterior': 2467, 'numeroConcursoProximo': 2469, 'valorTotalPremioFaixaUm': 0.0, 'numeroConcursoFinal_0_5': 2470, 'listaDezenas': ['022', '035', '041', '042', '053', '057'], 'listaDezenasSegundoSorteio': None, 'listaMunicipioUFGanhadores': [{'posicao': 1, 'ganhadores': 1, 'municipio': 'SANTOS', 'uf': 'SP', 'nomeFatansiaUL': '', 'serie': ''}], 'listaRateioPremio': [{'faixa': 1, 'numeroDeGanhadores': 1, 'valorPremio': 122627171.8, 'descricaoFaixa': '6 acertos'}, {'faixa': 2, 'numeroDeGanhadores': 267, 'valorPremio': 34158.18, 'descricaoFaixa': '5 acertos'}, {'faixa': 3, 'numeroDeGanhadores': 20734, 'valorPremio': 628.38, 'descricaoFaixa': '4 acertos'}], 'id': None, 'ultimoConcurso': True, 'exibirDetalhamentoPorCidade': True, 'premiacaoContingencia': None}
Simply extract listaDezenas and process it with a list comprehension:
[n if len(n) < 2 else n[1:] for n in jsonData['listaDezenas']]
Result will be:
['22', '35', '41', '42', '53', '57']
Example
import requests
jsonData = requests.get('https://servicebus2.caixa.gov.br/portaldeloterias/api/megasena/').json()
print([n if len(n) < 2 else n[1:] for n in jsonData['listaDezenas']])
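An equivalent way to drop the leading zero (my own note, assuming every entry is a zero-padded numeric string like '022') is to round-trip each value through int:

import requests

jsonData = requests.get('https://servicebus2.caixa.gov.br/portaldeloterias/api/megasena/').json()
# str(int(...)) strips any leading zeros, e.g. '022' -> '22'
print([str(int(n)) for n in jsonData['listaDezenas']])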

Scraping data from a site, and it returns nothing?

I'm trying to scrape some data from a site called laced.co.uk, and I'm a tad confused about what's going wrong. I'm new to this, so please try to explain it simply (if possible!). Here is my code:
from bs4 import BeautifulSoup
import requests
url = "https://www.laced.co.uk/products/nike-dunk-low-retro-black-white?size=7"
result = requests.get(url)
doc = BeautifulSoup(result.text, "html.parser")
prices = doc.find_all(text=" £195 ")
print(prices)
Thank you! (The price at the time of posting was £195; it showed as the size 7 "buy now" price on the page.)
The price is loaded within a <script> tag on the page:
<script>
  typeof(dataLayer) != "undefined" && dataLayer.push({
    'event': 'eec.productDetailImpression',
    'page': {
      'ecomm_prodid': 'DD1391-100'
    },
    'ecommerce': {
      'detail': {
        'actionField': {'list': 'Product Page'},
        'products': [{
          'name': 'Nike Dunk Low Retro Black White',
          'id': 'DD1391-100',
          'price': '195.0',
          'brand': 'Nike',
          'category': 'Dunk, Dunk Low, Mens Nike Dunks',
          'variant': 'White',
          'list': 'Product Page',
          'dimension1': '195.0',
          'dimension2': '7',
          'dimension3': '190',
          'dimension4': '332'
        }]
      }
    }
  });
</script>
You can use a regular expression pattern to search for the price. Note that there's no need for BeautifulSoup:
import re
import requests
url = "https://www.laced.co.uk/products/nike-dunk-low-retro-black-white?size=7"
result = requests.get(url)
price = re.search(r"'price': '(.*?)',", result.text).group(1)
print(f"£ {price}")

EBAY Finding API Date Filtering

I am trying to return a list of completed items in a given category using the eBay API. My code seems to be working; however, the results seem to be very limited (about 100). I was assuming there would be some limitation on how far back the API would go, but even just a few days should return thousands of results for this category. Am I missing something in the code, or is this just a limitation of the eBay API? I did make sure I was using production and not the sandbox.
So I have now realized that there are multiple pages to my query, up to the 100 items / 100 pages max. I am now running into issues with the date filtering. I see the filter reference material on the site, but I am still not getting the result I expect. In the updated query I am trying to pull only items completed yesterday, but when I run it I get items from today. Is there a better way to input the date filters?
from ebaysdk.finding import Connection as finding
from bs4 import BeautifulSoup
import os
import csv

api = finding(appid=<my appid>, config_file=None)

response = api.execute(
    'findCompletedItems', {
        'categoryId': '214',
        'keywords': 'prizm',
        'endTimeFrom': '2020-02-03T00:00:00.000Z',
        'endTimeTo': '2020-02-04T00:00:00.000Z',
        'paginationInput': {
            'entriesPerPage': '100',
            'pageNumber': '1'
        },
        'sortOrder': 'EndTimeSoonest'
    }
)

soup = BeautifulSoup(response.content, 'lxml')
totalitems = int(soup.find('totalentries').text)
items = soup.find_all('item')

for item in response.reply.searchResult.item:
    print(item.itemId)
    print(item.listingInfo.endTime)
I finally figured this out. I needed to add additional code for the item filters. The working code is below.
from ebaysdk.finding import Connection as finding
from bs4 import BeautifulSoup
import os
import csv

api = finding(appid=<my appid>, config_file=None)

response = api.execute(
    'findCompletedItems', {
        'categoryId': '214',
        'keywords': 'prizm',
        'itemFilter': [
            {'name': 'EndTimeFrom', 'value': '2020-02-03T00:00:00.000Z'},
            {'name': 'EndTimeTo', 'value': '2020-02-04T00:00:00.000Z'}
            #{'name': 'MinPrice', 'value': '200', 'paramName': 'Currency', 'paramValue': 'GBP'},
            #{'name': 'MaxPrice', 'value': '400', 'paramName': 'Currency', 'paramValue': 'GBP'}
        ],
        'paginationInput': {
            'entriesPerPage': '100',
            'pageNumber': '100'
        },
        'sortOrder': 'EndTimeSoonest'
    }
)

soup = BeautifulSoup(response.content, 'lxml')
totalitems = int(soup.find('totalentries').text)
items = soup.find_all('item')

for item in response.reply.searchResult.item:
    print(item.itemId)
    print(item.listingInfo.endTime)
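To pull more than one page of results, here is a hedged sketch of a pagination loop over the same query (not part of the original post). It assumes the reply exposes paginationOutput.totalPages, which is how the Finding API normally reports its page count, and it keeps the 100-page cap mentioned above; the app id is a placeholder.

from ebaysdk.finding import Connection as finding

api = finding(appid='YOUR_APP_ID', config_file=None)  # placeholder: use your production app id

request = {
    'categoryId': '214',
    'keywords': 'prizm',
    'itemFilter': [
        {'name': 'EndTimeFrom', 'value': '2020-02-03T00:00:00.000Z'},
        {'name': 'EndTimeTo', 'value': '2020-02-04T00:00:00.000Z'}
    ],
    'sortOrder': 'EndTimeSoonest'
}

page = 1
while True:
    request['paginationInput'] = {'entriesPerPage': '100', 'pageNumber': str(page)}
    response = api.execute('findCompletedItems', request)
    # guard against pages with no items in the search result
    for item in getattr(response.reply.searchResult, 'item', []):
        print(item.itemId, item.listingInfo.endTime)
    # assumption: paginationOutput.totalPages is present in the reply
    total_pages = int(response.reply.paginationOutput.totalPages)
    if page >= total_pages or page >= 100:  # the API caps results at 100 pages
        break
    page += 1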

Python request: Unable to process a request within iFrame

Before I give up and opt for the Selenium route, I was trying to automate this page (yeezysupply.com/products/mens-crepe-boot-oil) with Python requests all the way to checkout, but I got stuck at the checkout page: the credit-card form and its requests are loaded in an iframe and submitted to a different URL, https://elb.deposit.shopifycs.com/sessions, which starts returning a 500 Internal Server error. Below is the relevant section of the code I tried that did not work:
payment_method_url = r.url.split('?')
payment_method_url = payment_method_url[0]
BILLING_FIRST_NAME = 'Jon'
BILLING_LAST_NAME = 'Norman'
BILLING_ADDRESS_1 = 'G-309'
BILLING_ADDRESS_2 = 'G-309'
BILLING_CITY = 'Chicago'
BILLING_COUNTRY = 'United States'
BILLING_PROVINCE = 'Illinois'
BILLING_ZIP = '60007'
BILLING_PHONE = '149584848485'
TOTAL_PRICE = 66500
# For final Checkout
s.options('https://elb.deposit.shopifycs.com/sessions', headers=headers)
session_url = 'https://elb.deposit.shopifycs.com/sessions'
session_data = '{"credit_card":{"number":"4242 4242 4242 4242","name":"Jon Norman","month":9,"year":2019,"verification_value":"123"}}'
sleep(3)
s.headers.update({
    'referer': 'https://checkout.shopifycs.com/number?identifier=eeb4fe88a0fd4063043eeb5730d460f4&location=https%3A%2F%2Fpurchase.yeezysupply.com%2F17655971%2Fcheckouts%2Feeb4fe88a0fd4063043eeb5730d460f4'})

data = {
    'utf8': 'utf8',
    '_method': 'patch',
    'authenticity_token': authenticity_token,
    'previous_step': 'payment_method',
    'step': '',
    's': 'east-50fb8458975b56217d7317847efb9280',
    'checkout[payment_gateway]': '117647559',
    'checkout[credit_card][vault]': 'false',
    'checkout[payment_gateway]': '117647559',
    'checkout[different_billing_address]': 'true',
    'checkout[billing_address][first_name]': BILLING_FIRST_NAME,
    'checkout[billing_address][last_name]': BILLING_LAST_NAME,
    'checkout[billing_address][address1]': BILLING_ADDRESS_1,
    'checkout[billing_address][address2]': BILLING_ADDRESS_2,
    'checkout[billing_address][city]': BILLING_CITY,
    'checkout[billing_address][country]': BILLING_COUNTRY,
    'checkout[billing_address][province]': BILLING_PROVINCE,
    'checkout[billing_address][zip]': BILLING_ZIP,
    'checkout[billing_address][remember_me]': 'false',
    'checkout[billing_address][remember_me]': '0',
    'checkout[billing_address][remember_me_country_code]': '',
    'checkout[billing_address][remember_me_phone]': '',
    'checkout[billing_address][total_price]': TOTAL_PRICE,
    'complete': '1',
    'checkout[client_details][browser_width]': '1280',
    'checkout[client_details][browser_height]': '150',
    'checkout[client_details][javascript_enabled]': '1',
}
sleep(2)
r = s.post(payment_method_url+'/processing', data=data, headers=headers)
r = s.get(payment_method_url, headers=headers)
print(r.text)
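For reference, a minimal hedged sketch of how the session_data payload defined above would typically be sent to session_url before the final checkout POST; the Content-Type header and the shape of the response are assumptions about the endpoint, not confirmed behaviour.

# Hedged sketch only: the header and the response shape are assumptions.
card_resp = s.post(
    session_url,
    data=session_data,  # the JSON string built above
    headers={**headers, 'Content-Type': 'application/json'}  # assumed: the endpoint expects a JSON body
)
print(card_resp.status_code)
if card_resp.ok:
    # assumption: a successful response carries a session token used later in the checkout POST;
    # a 500 here suggests the payload or headers differ from what the iframe normally sends
    print(card_resp.json())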

Parsing URL through Python

I need to parse
http://www.webpagetest.org/breakdown.php?test=150325_34_0f581da87c16d5aac4ecb7cd07cda921&run=2&cached=0
If you view the source of the above URL, you will find a series of fvRequests.setValue(...) calls inside a script.
Expected Output:
fvRequests= css
fvRequests=7
import re
import urllib2

if __name__ == "__main__":
    url = 'http://www.webpagetest.org/breakdown.php?test=150325_34_0f581da87c16d5aac4ecb7cd07cda921&run=2&cached=0'

    # http request
    response = urllib2.urlopen(url)
    html = response.read()
    response.close()

    # finding values in html
    results = re.findall(r'fvRequests\.setValue\(\d+, \d+, \'?(.*?)\'?\);', html)
    keys = results[::2]
    values = results[1::2]

    # creating a dictionary
    output = dict(zip(keys, values))
    print output
The idea is to locate the script with BeautifulSoup and use a regular expression pattern to find the fvRequests.setValue() calls and extract the value of the third argument:
import re
from bs4 import BeautifulSoup
import requests
pattern = re.compile(r"fvRequests\.setValue\(\d+, \d+, '?(\w+)'?\);")
response = requests.get("http://www.webpagetest.org/breakdown.php?test=150325_34_0f581da87c16d5aac4ecb7cd07cda921&run=2&cached=0")
soup = BeautifulSoup(response.content)
script = soup.find("script", text=lambda x: x and "fvRequests.setValue" in x).text
print(re.findall(pattern, script))
Prints:
[u'css', u'7', u'flash', u'0', u'font', u'0', u'html', u'14', u'image', u'80', u'js', u'35', u'other', u'14']
You can go further and pack the list into a dict (solution taken from here):
dict(zip(*([iter(data)] * 2)))
which would produce:
{
'image': '80',
'flash': '0',
'js': '35',
'html': '14',
'font': '0',
'other': '14',
'css': '7'
}
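Putting the two steps together, a small end-to-end sketch (same URL, selector, and pattern as above) would look like this:

import re

import requests
from bs4 import BeautifulSoup

pattern = re.compile(r"fvRequests\.setValue\(\d+, \d+, '?(\w+)'?\);")

response = requests.get("http://www.webpagetest.org/breakdown.php?test=150325_34_0f581da87c16d5aac4ecb7cd07cda921&run=2&cached=0")
soup = BeautifulSoup(response.content, "lxml")
script = soup.find("script", text=lambda x: x and "fvRequests.setValue" in x).text

data = re.findall(pattern, script)       # ['css', '7', 'flash', '0', ...]
result = dict(zip(*([iter(data)] * 2)))  # pair consecutive key/value entries
print(result)                            # {'css': '7', 'flash': '0', ...}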
