Not able to extract data or find problem with code

Not able to extract data or find problem with code - python

Below is the code for extracting news articles for different companies from Google news. This gives me an empty excel file with only the headers. I am not able to figure out what the error is. Can someone please help out. (The entire code can be copy-pasted, and reproduced in your machine):
import requests
import random
from collections import OrderedDict
def list_header():
headers_list = [
# Firefox 24 Linux
{
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
},
# Firefox Mac
{
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
}]
return headers_list
def list_dict():
# Get headers list
headers_list = list_header()
# Create ordered dict from Headers above
ordered_headers_list = []
for headers in headers_list:
h = OrderedDict()
for header,value in headers.items():
h[header]=value
ordered_headers_list.append(h)
return ordered_headers_list
def list_test():
headers_list = list_dict()
max = len(headers_list)
url = 'https://httpbin.org/headers'
for i in range(0,max):
#Pick a random browser headers
headers = random.choice(headers_list)
#Create a request session
r = requests.Session()
r.headers = headers
response = r.get(url)
print("Request #%d\nUser-Agent Sent:%s\n\nHeaders Recevied by HTTPBin:"%(i,headers['User-Agent']))
print(response.json())
print("-------------------")
def random_header():
headers_list = list_dict()
headers = random.choice(headers_list)
return headers
import pandas as pd
def ingest_google_news():
ticker_list = ['AAPL.O', 'MSFT', 'GOOG', '2222.SR', 'AMZN', 'FB', 'TSLA', 'BRK-A', 'TCEHY', 'TSM', 'NVDA', 'JPM', 'BABA', 'V', 'JNJ', '005930.KS', 'WMT', 'LVMUY']
sep = '.'
df = pd.DataFrame()
t_news = []
t_publisher = []
t_urls = []
t_dates = []
t_tickers = []
for t in ticker_list:
news = []
publisher = []
urls = []
dates = []
tickers = []
# cleaning ticker
ticker = t
t = t.split(sep, 1)[0]
# set header by random user agent
r = requests.Session()
headers = random_header()
r.headers = headers
# print(headers)
# set query for google
query = '{} news'.format(t)
url = f"https://www.google.com/search?q={query}&tbm=nws&lr=lang_en&hl=en&sort=date&num=5"
res = r.get(url, headers=headers)
soup = bs4.BeautifulSoup(res.text, "html.parser")
links = soup.select(".dbsr a")
for l in links:
tickers.append(t)
try:
url_w = l.get("href")
print(url_w)
urls.append(url_w)
dt = find_date(url_w)
dates.append(dt)
res = requests.get(url_w, headers=headers)
parsed_article = bs4.BeautifulSoup(res.text,'lxml')
paragraphs = parsed_article.find_all('p')
article_text = ""
for p in paragraphs:
article_text += p.text
except Exception as e:
article_text = ''
news.append(article_text)
sources = soup.select(".XTjFC g-img")
for s in sources:
publisher.append(s.next_sibling.lower())
t_urls += urls
t_news += news
t_publisher += publisher
t_dates += dates
t_tickers += tickers
df['ticker'] = t_tickers
df['links'] = t_urls
df['article_text'] = t_news
df['publisher'] = t_publisher
df['created_at'] = t_dates
# import to csv
today = datetime.date.today()
d1 = today.strftime("%d%m%Y")
df.to_csv(f'/content/drive/MyDrive/google_news_{d1}.csv')
del news, publisher, urls, dates, tickers
del t_news, t_publisher, t_urls, t_dates, t_tickers
import bs4
from bs4 import BeautifulSoup
import datetime
ingest_google_news()
The code above is from the following link: https://medium.com/analytics-vidhya/google-scraping-using-beautifulsoup-d53746ef5a32

Have a look at the SelectorGadget Chrome extension to grab CSS selectors by clicking on the desired element in your browser. CSS selectors reference.
Code and example in the online IDE (extracts the title, link, snippet, date published, source, and stores to CSV):
from bs4 import BeautifulSoup
import requests, lxml
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
ticker_list = ['AAPL.O', 'LVS', 'COTY.K','JPM', 'XOM', '005930.KS']
def get_news():
# iterate over each ticker
for news in ticker_list:
params = {
"q": news, # query
"hl": "en", # language
"gl": "us", # country to search from
"tbm": "nws", # google news filter
}
# store news data
news_data = []
html = requests.get('https://www.google.com/search', headers=headers, params=params)
soup = BeautifulSoup(html.text, 'lxml')
# container with needed data
for result in soup.select('.WlydOe'):
title = result.select_one('.nDgy9d').text
link = result['href']
source = result.select_one('.CEMjEf span').text
snippet = result.select_one('.GI74Re').text
date_published = result.select_one('p.S1FAPd').text
print(f'{title}\n{link}\n{snippet}\n{date_published}\n{source}\n')
# append news data to list in dict() format to save it later via pandas
news_data.append({
'title': title,
'link': link,
'snippet': snippet,
'date_published': date_published,
'source': source
})
# create DataFrame from the list()
df = pd.DataFrame(news_data)
# save DataFrame to csv without default index column on the left side
df.to_csv('bs4_final.csv', index=False)
get_news()
----------
'''
EU plans to legislate for common phone charger despite ...
https://www.reuters.com/technology/eu-plans-legislate-common-phone-charger-despite-apple-grumbles-2021-09-21/
... a person familiar with the matter said - a move likely to affect iPhone
maker Apple (AAPL.O) more than its rivals.
2 hours ago
Reuters
Wall Street ends sharply lower in broad sell-off
https://www.reuters.com/business/wall-street-ends-sharply-lower-broad-sell-off-2021-09-20/
O), Amazon.com Inc (AMZN.O), Apple Inc (AAPL.O), Facebook Inc (FB. ... O)
were among the biggest drags on the index as well as the S&P 500.
18 hours ago
Reuters
... other results
'''
Alternatively, you can achieve the same thing by using Google News Results API from SerpApi. It's a paid API with a free plan.
The main difference in your example is that you don't need to make things that complicated, figure out why things just don't work as expected, and then maintain it over time. Instead, you only need to iterate over structured JSON and get what you want, fast.
Code to integrate (same process for saving to csv as with code above):
import os
from serpapi import GoogleSearch
ticker_list = ['AAPL.O', 'LVS', 'COTY.K','JPM', 'XOM', '005930.KS']
def get_news():
for news in ticker_list:
params = {
"engine": "google",
"q": news,
"gl": "us",
"tbm": "nws",
"api_key": os.getenv("API_KEY"), # API environment variable
}
search = GoogleSearch(params)
results = search.get_dict()
for news_result in results["news_results"]:
print(f"Title: {news_result['title']}\nLink: {news_result['link']}\n")
get_news()
-----------
'''
Title: EU plans to legislate for common phone charger despite ...
Link: https://www.reuters.com/technology/eu-plans-legislate-common-phone-charger-despite-apple-grumbles-2021-09-21/
Title: Wall Street ends sharply lower in broad sell-off
Link: https://www.reuters.com/business/wall-street-ends-sharply-lower-broad-sell-off-2021-09-20/
Title: S&P 500 down more than 2% as growth worries rise
Link: https://www.reuters.com/business/sp-500-down-more-than-2-growth-worries-rise-2021-09-20/
Title: Apple joins streaming elite, Netflix crosses milestone with ...
Link: https://www.reuters.com/technology/apple-joins-streaming-elite-netflix-crosses-milestone-with-emmy-wins-2021-09-20/
... other results
'''
Disclaimer, I work for SerpApi.

Related

Why is this web scrape not working on python?

I haven’t recently been using the code attached. For the past few weeks, it has been working completely fine and always produced results. However, I used this today and for some reason it didn’t work. Could you please help and provide a solution to the problem.
import requests, json
from bs4 import BeautifulSoup
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {"q": "dji", "hl": "en", 'gl': 'us', 'tbm': 'shop'}
response = requests.get("https://www.google.com/search",
params=params,
headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
# list with two dict() combined
shopping_data = []
shopping_results_dict = {}
for shopping_result in soup.select('.sh-dgr__content'):
title = shopping_result.select_one('.Lq5OHe.eaGTj h4').text
product_link = f"https://www.google.com{shopping_result.select_one('.Lq5OHe.eaGTj')['href']}"
source = shopping_result.select_one('.IuHnof').text
price = shopping_result.select_one('span.kHxwFf span').text
try:
rating = shopping_result.select_one('.Rsc7Yb').text
except:
rating = None
try:
reviews = shopping_result.select_one('.Rsc7Yb').next_sibling.next_sibling
except:
reviews = None
try:
delivery = shopping_result.select_one('.vEjMR').text
except:
delivery = None
shopping_results_dict.update({
'shopping_results': [{
'title': title,
'link': product_link,
'source': source,
'price': price,
'rating': rating,
'reviews': reviews,
'delivery': delivery,
}]
})
shopping_data.append(dict(shopping_results_dict))
print(title)

Because .select in for shopping_result in soup.select('.sh-dgr__content'): could not find any element so it gives you an empty list. Therefor the body of the for-loop is not executed. Python jumps out of the loop.
title only exists and is defined when the body of the for loop executes.
You should make sure you used a correct method to find your element(s).

Formatting scraped data Python Beautifulsoup

I am trying to scrape from this URL https://www.spoonflower.com/en/shop?on=fabric design names, creator names, fabric types, prices as per fabric type
The good thing is they have public API endpoints which make the data extraction simple
But the problem is they have different URLs for design names and for pricing
i.e to collect names of design and the creator name I have to ping this URL https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
And for pricing per fabric type requesting this endpoint
https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
I am getting correct data but the problem I stumbled across some formatting issues.
What I am looking for is something like this.
Each design with its fabric type alongside its prices in a single row. Instead, I am getting this kind of output
It would be great if anyone here can guide me through this like how to get the expected_output_result I am looking for.
Below is my code:
import requests
from bs4 import BeautifulSoup
import json
import csv
cookies = {
'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
'Content-Type': 'application/json',
'Origin': 'https://www.spoonflower.com',
'Connection': 'keep-alive',
'Referer': 'https://www.spoonflower.com/',
'Sec-GPC': '1',
'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
'TE': 'Trailers',
}
res = requests.get('https://www.spoonflower.com/spoonflower_fabrics')
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [fabric.find('h2').text.strip() for fabric in soup.find_all('div', {'class': 'product_detail medium_text'})]
fabric = []
for fab in fabrics:
fabric.append(("_".join(fab.upper().split())))
#https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
#https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
item_endpoint = 'https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en'
item_response = requests.get(item_endpoint).json()
#item_data = items_json['page_results'][0]
scraped_items = []
for item in item_response['page_results']:
for fab_type in fabric:
details_endpoint = 'https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_' + fab_type + '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id='+ str(item['designId']) + '&page_locale=en'
details_endpoint_response = requests.get(details_endpoint, headers=headers, cookies=cookies).json()
designName = item['name'],
screeName = item['user']['screenName']
fabric_name = details_endpoint_response['data']['fabric_code']
try:
test_swatch_meter = details_endpoint_response['data']['pricing']['TEST_SWATCH_METER']['price']
except:
test_swatch_meter = 'N/A'
try:
fat_quarter_meter = details_endpoint_response['data']['pricing']['FAT_QUARTER_METER']['price']
except:
fat_quarter_meter = 'N/A'
try:
meter = details_endpoint_response['data']['pricing']['METER']['price']
except:
meter = 'N/A'
scraped_items.append({
'designName': designName,
'screenName': screeName,
'fabric_name': fabric_name,
'test_swatch_meter': test_swatch_meter,
'fat_quarter_meter': fat_quarter_meter,
'meter': meter
})
print(designName, screeName, fabric_name, test_swatch_meter,fat_quarter_meter, meter)
print(json.dumps(scraped_items, indent=2))
#print(type(details_endpoint))
#print(type(items_json['page_results'][0]))
with open('scraped_data.csv', 'w', newline='') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=scraped_items[0].keys())
writer.writeheader()
for row in scraped_items:
writer.writerow(row)
#print(fabric)

One way to do it is just reconfigure how you construct the output. Instead of a list, use a dictionary where designName, screenName, followed by the values. One thing to keep in mind is dictionaries don't allow duplicate keys, so had to number the column names, however you can remove those later if you'd like.
See if this gets what you are wanting:
import requests
from bs4 import BeautifulSoup
import json
import csv
import pandas as pd
from collections import OrderedDict
cookies = {
'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
'Content-Type': 'application/json',
'Origin': 'https://www.spoonflower.com',
'Connection': 'keep-alive',
'Referer': 'https://www.spoonflower.com/',
'Sec-GPC': '1',
'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
'TE': 'Trailers',
}
res = requests.get('https://www.spoonflower.com/spoonflower_fabrics')
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [fabric.find('h2').text.strip() for fabric in soup.find_all('div', {'class': 'product_detail medium_text'})]
fabric = []
for fab in fabrics:
fabric.append(("_".join(fab.upper().split())))
#https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
#https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
item_endpoint = 'https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en'
item_response = requests.get(item_endpoint).json()
#item_data = items_json['page_results'][0]
items_dict = OrderedDict()
for item in item_response['page_results']:
for fab_type in fabric:
details_endpoint = 'https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_' + fab_type + '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id='+ str(item['designId']) + '&page_locale=en'
details_endpoint_response = requests.get(details_endpoint, headers=headers, cookies=cookies).json()
designName = item['name']
screenName = item['user']['screenName']
fabric_name = details_endpoint_response['data']['fabric_code']
try:
test_swatch_meter = details_endpoint_response['data']['pricing']['TEST_SWATCH_METER']['price']
except:
test_swatch_meter = 'N/A'
try:
fat_quarter_meter = details_endpoint_response['data']['pricing']['FAT_QUARTER_METER']['price']
except:
fat_quarter_meter = 'N/A'
try:
meter = details_endpoint_response['data']['pricing']['METER']['price']
except:
meter = 'N/A'
if (designName, screenName) not in items_dict.keys():
items_dict[(designName, screenName)] = {}
itemCount = len(items_dict[(designName, screenName)].values()) / 4
items_dict[(designName, screenName)].update({'fabric_name_%02d' %itemCount: fabric_name,
'test_swatch_meter_%02d' %itemCount: test_swatch_meter,
'fat_quarter_meter_%02d' %itemCount: fat_quarter_meter,
'meter_%02d' %itemCount: meter})
print(designName, screenName, fabric_name, test_swatch_meter,fat_quarter_meter, meter)
df = pd.DataFrame.from_dict(items_dict, orient='index').reset_index(drop=False)
df = df.rename(columns={'level_0':'designName','level_1':'screenName'})
df.to_csv('scraped_data.csv', index=False)

webscraping script for booking.com doesn't work

I made a script to scrape hotel name, rating and perks from hotels on this page : link
Here's my script :
import numpy as np
import time
from random import randint
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re
import random
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
'Referer': 'https://www.espncricinfo.com/',
'Upgrade-Insecure-Requests': '1',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
}
url0 = 'https://www.booking.com/searchresults.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city&'
links1 = []
results = requests.get(url0, headers = headers)
soup = BeautifulSoup(results.text, "html.parser")
links1 = [a['href'] for a in soup.find("div", {"class": "hotellist sr_double_search"}).find_all('a', class_ = 'js-sr-hotel-link hotel_name_link url', href=True)]
root_url = 'https://www.booking.com/'
urls1 = [ '{root}{i}'.format(root=root_url, i=i) for i in links1 ]
pointforts = []
hotels = []
notes = []
for url in urls1:
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")
try :
div = soup.find("div", {"class": "hp_desc_important_facilities clearfix hp_desc_important_facilities--bui"})
pointfort = [x['data-name-en'] for x in div.select('div[class*="important_facility"]')]
pointforts.append(pointfort)
except:
pointforts.append('Nan')
try:
note = soup.find('div', class_ = 'bui-review-score__badge').text
notes.append(note)
except:
notes.append('Nan')
try:
hotel = soup.find("h2",attrs={"id":"hp_hotel_name"}).text.strip("\n").split("\n")[1]
hotels.append(hotel)
except:
hotels.append('Nan')
data = pd.DataFrame({
'Notes' : notes,
'Points fort' : pointforts,
'Nom' : hotels})
#print(data.head(20))
data.to_csv('datatest.csv', sep=';', index=False, encoding = 'utf_8_sig')
It worked, I made a loop to scrape all the links for the hotel and after scrape ratings and perks for all of those hotels. But I had doublons, so instead of :
links1 = [a['href'] for a in soup.find("div", {"class": "hotellist sr_double_search"}).find_all('a', href=True)]
I put : links1 = [a['href'] for a in soup.find("div", {"class": "hotellist sr_double_search"}).find_all('a', class_ = 'js-sr-hotel-link hotel_name_link url', href=True)] as you can see in my script above.
But now it doesn't work anymore, I obtained only Nan, while before, when I had doublons, I have some with Nan but most of them have the perks I wanted and the ratings. I don't understand why.
Here's the html for the hotels links :
hotellink
Here's the html to get the name (after I obtaine the link, the script go to this link) :
namehtml
And here's the html to get all the perks related to the hotel (Like the name, the script go to the link I scraped before) :
perkshtml
And here's my result...
output

The href tags on that website contain newlines. One at the start and also some mid way through. As such when you try and combine root_url you are not getting valid URLs.
A fix can be to remove all the newlines. As the href always starts with a / this can also be removed from the root_url, or you could use urllib.parse.urljoin().
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
'Referer': 'https://www.espncricinfo.com/',
'Upgrade-Insecure-Requests': '1',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
}
url0 = 'https://www.booking.com/searchresults.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city&'
results = requests.get(url0, headers = headers)
soup = BeautifulSoup(results.text, "html.parser")
links1 = [a['href'].replace('\n','') for a in soup.find("div", {"class": "hotellist sr_double_search"}).find_all('a', class_ = 'js-sr-hotel-link hotel_name_link url', href=True)]
root_url = 'https://www.booking.com'
urls1 = [f'{root_url}{i}' for i in links1]
pointforts = []
hotels = []
notes = []
for url in urls1:
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")
try:
div = soup.find("div", {"class": "hp_desc_important_facilities clearfix hp_desc_important_facilities--bui"})
pointfort = [x['data-name-en'] for x in div.select('div[class*="important_facility"]')]
pointforts.append(pointfort)
except:
pointforts.append('Nan')
try:
note = soup.find('div', class_ = 'bui-review-score__badge').text
notes.append(note)
except:
notes.append('Nan')
try:
hotel = soup.find("h2",attrs={"id":"hp_hotel_name"}).text.strip("\n").split("\n")[1]
hotels.append(hotel)
except:
hotels.append('Nan')
data = pd.DataFrame({
'Notes' : notes,
'Points fort' : pointforts,
'Nom' : hotels})
#print(data.head(20))
data.to_csv('datatest.csv', sep=';', index=False, encoding = 'utf_8_sig')
This would give you an output CSV file starting:
Notes;Points fort;Nom
8,3 ;['Parking (fee required)', 'Free WiFi Internet Access Included', 'Family Rooms', 'Airport Shuttle', 'Non Smoking Rooms', '24 hour Front Desk', 'Bar'];Elysées Union
8,4 ;['Free WiFi Internet Access Included', 'Family Rooms', 'Non Smoking Rooms', 'Pets allowed', '24 hour Front Desk', 'Rooms/Facilities for Disabled'];Hyatt Regency Paris Etoile
8,3 ;['Free WiFi Internet Access Included', 'Family Rooms', 'Non Smoking Rooms', 'Pets allowed', 'Restaurant', '24 hour Front Desk', 'Bar'];Pullman Paris Tour Eiffel
8,7 ;['Free WiFi Internet Access Included', 'Non Smoking Rooms', 'Restaurant', '24 hour Front Desk', 'Rooms/Facilities for Disabled', 'Elevator', 'Bar'];citizenM Paris Gare de Lyon

Href attribute not showing for <a> tag when using requests.post

I am trying to download the .csv file that appears on this page when I submit the data as
Data for : Security-wise Price volume & Deliverable position data
Symbol : 3INFOTECH
Select Series : All
Period : 24 months
My code is
symbol = "3IINFOTECH"
url = "https://www.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp"
data = {
"dataType":"priceVolumeDeliverable",
"symbol":symbol,
"segmentLink":"3",
"symbolCount":"2",
"series":"ALL",
"rdPeriod":"groupPeriod",
"dateRange":"24month"
}
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
}
print("fetching for " + symbol)
session = requests.session()
response = requests.post(url, data, headers = headers)
html_content = response.text
soup = BeautifulSoup(html_content, "html.parser")
download_link = soup.findAll("span", attrs = {"class":"download-data-link"})[0]
print(download_link.a["href"])
Now on inspecting element I see this
How do I download the csv file? The post request from my code does not show me the href attribute.

To get the link you have to click the button, so you could use selenium or something equivalent but it is very easy to just parse the data yourself as all you get back from the post request is the data:
symbol = "3IINFOTECH"
url = "https://www.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp"
data = {
"dataType": "priceVolumeDeliverable",
"symbol": symbol,
"segmentLink": "3",
"symbolCount": "2",
"series": "ALL",
"rdPeriod": "groupPeriod",
"dateRange": "24month"
}
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
}
print("fetching for " + symbol)
import csv
response = requests.post(url, data, headers=headers)
html_content = response.text
soup = BeautifulSoup(html_content, "html.parser")
cols = [th.text for th in soup.select("th")]
rows = ([td.text for td in row.select("td")] for row in soup.select("tr + tr"))
with open("data.csv", "w") as f:
wr = csv.writer(f)
wr.writerow(cols)
wr.writerows(rows)
A snippet of data.csv:
Symbol,Series,Date,Prev Close,Open Price,High Price,Low Price,Last Price,Close Price,VWAP,Total Traded Quantity,Turnover in Lacs,No. of Trades,DeliverableQty,% Dly Qt toTraded Qty
3IINFOTECH,EQ,23-May-2014,9.90,10.25,10.70,9.70,10.10,10.10,10.23,"84,99,408",869.20,"16,539","40,35,648",47.48
3IINFOTECH,EQ,26-May-2014,10.10,10.40,10.60,9.10,9.30,9.20,9.97,"59,15,990",589.88,"9,894","27,10,021",45.81
3IINFOTECH,EQ,27-May-2014,9.20,9.20,9.30,8.30,8.60,8.55,8.53,"34,95,072",298.18,"3,600","14,71,141",42.09
3IINFOTECH,EQ,28-May-2014,8.55,8.60,9.40,8.45,9.30,9.15,9.07,"36,09,261",327.27,"3,955","13,92,733",38.59
3IINFOTECH,EQ,29-May-2014,9.15,9.25,9.50,8.80,9.40,9.35,9.28,"30,13,036",279.69,"3,090","15,20,654",50.47
3IINFOTECH,EQ,30-May-2014,9.35,9.35,9.55,8.90,9.00,9.00,9.13,"13,97,140",127.53,"1,992","7,43,964",53.25

How to return only today and yesterday's information that published using POST requests

I need to get the information that published today and a day before. Also when importing it to a csv file it only print the first column not the remained ones.
The URL: https://e-mehkeme.gov.az/Public/Cases
The dates stored in html as <td style="width:95px;text-align:center">28.10.2019</td>
import requests, re
from bs4 import BeautifulSoup as bs
import csv
request_headers = {
'authority': 'e-mehkeme.gov.az',
'method': 'POST',
'path': '/Public/Cases',
'scheme': 'https',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
'application/signed-exchange;v=b3',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en,en-GB;q=0.9',
'cache-control': 'max-age=0',
'content-length': '66',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://e-mehkeme.gov.az',
'referer': 'https://e-mehkeme.gov.az/Public/Cases',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/75.0.3770.142 Safari/537.36',
}
voens = {'3100608381',
}
form_data = {
'CourtId': '',
'CaseNo': '',
'DocFin': '',
'DocSeries': '',
'DocNumber': '',
'VOEN': voens,
'button': 'Search',
}
url = 'https://e-mehkeme.gov.az/Public/Cases?courtid='
response = requests.post(url, data=form_data, headers=request_headers)
s = bs(response.content, 'lxml')
# PRINT THE CONTENTS OF EACH SEARCH!
for voen in voens:
form_data['VOEN'] = voen
r = requests.post('https://e-mehkeme.gov.az/Public/Cases', data=form_data)
soup = bs(r.text, 'lxml')
ids = [i['value'] for i in soup.select('.casedetail')]
for i in ids:
r = requests.get(f'https://e-mehkeme.gov.az/Public/CaseDetail?caseId={i}')
soup = bs(r.content, 'lxml')
output = [re.sub('\s+', ' ', i.text.strip()) for i in soup.select('[colspan="4"]')]
print(output)
with open('courtSearch.csv', 'w', newline='', encoding='utf-8') as myfile:
writer = csv.writer(myfile, quoting=csv.QUOTE_ALL)
writer.writerow(output)
DESIRED OUTPUT:

The following uses a slightly different url construct so you can use a GET request and easily gather all pages of results per voen. I gather the string dates and caseIds (required for later requests) during each request. I then use a mask (for days of interest e.g. today and yesterday, converted to strings of same format as on website) to filter for only the ids within desired date range. I then loop that filtered list and issue requests for the pop-up window info.
Within the code you can also see commented out sections. One of which shows you the results retrieved from each page
#print(pd.read_html(str(soup.select_one('#Cases')))[0]) ##view table
I am splitting on the header phrases (so assuming these are regular) such that I can split each string from row into the appropriate output columns.
Possiby requires bs4 4.7.1 +
import requests,re, csv
from bs4 import BeautifulSoup as bs
from datetime import datetime, timedelta
import pandas as pd
headers = ['Ətraflı məlumat: ', 'Cavabdeh: ', 'İddiaçı: ', 'İşin mahiyyəti ']
voens = ['2002283071','1303450301', '1700393071']
number_of_past_days_plus_today = 2
mask = [datetime.strftime(datetime.now() - timedelta(day_no), '%d.%m.%Y') for day_no in range(0, number_of_past_days_plus_today)]
ids = []
table_dates = []
with requests.Session() as s:
for voen in voens:
#print(voen) ##view voen
page = 1
while True:
r = s.get(f'https://e-mehkeme.gov.az/Public/Cases?page={page}&voen={voen}') #to get all pages of results
soup = bs(r.text, 'lxml')
ids.extend([i['value'] for i in soup.select('.casedetail')])
#print(pd.read_html(str(soup.select_one('#Cases')))[0]) ##view table
table_dates.extend([i.text.strip() for i in soup.select('#Cases td:nth-child(2):not([colspan])')])
if soup.select_one('[rel=next]') is None:
break
page+=1
pairs = list(zip(table_dates,ids))
filtered = [i for i in pairs if i[0] in mask]
#print(100*'-') ##spacing
#print(filtered) ##view final filtered list of ids
results = []
for j in filtered:
r = s.get(f'https://e-mehkeme.gov.az/Public/CaseDetail?caseId={j[1]}')
soup = bs(r.content, 'lxml')
line = ' '.join([re.sub('\s+',' ',i.text.strip()) for i in soup.select('[colspan="4"]')])
row = re.split('|'.join(headers),line)
results.append(row[1:])
with open("results.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
w = csv.writer(csv_file, delimiter = ",", quoting=csv.QUOTE_MINIMAL)
w.writerow(headers)
for row in results:
w.writerow(row)
I searched for splitting on multiple delimiters and used the idea given by #Jonathan here. So upvoted for credit to that user.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Not able to extract data or find problem with code - python

Related

Why is this web scrape not working on python?

Formatting scraped data Python Beautifulsoup

webscraping script for booking.com doesn't work

Href attribute not showing for <a> tag when using requests.post

How to return only today and yesterday's information that published using POST requests

Categories

Resources