I am trying to scrape design names, creator names, fabric types, and prices per fabric type from this URL: https://www.spoonflower.com/en/shop?on=fabric
The good thing is that they have public API endpoints, which makes the data extraction simple.
But the problem is that they use different URLs for design names and for pricing.
That is, to collect the design names and creator names I have to request this URL:
https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
And for pricing per fabric type I request this endpoint:
https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
I am getting the correct data, but I stumbled across some formatting issues.
What I am looking for is each design, with its fabric types and their prices side by side, in a single row. Instead, my current code produces a separate row for every design/fabric combination.
It would be great if someone could guide me on how to get the expected output I am looking for.
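For clarity, the single-row layout I am after looks roughly like this (the column names and values here are only illustrative placeholders):
designName, screenName, fabric_1, test_swatch_meter_1, fat_quarter_meter_1, meter_1, fabric_2, test_swatch_meter_2, ...
Some Design, some_creator, PETAL_SIGNATURE_COTTON, ..., ..., ..., SOME_OTHER_FABRIC, ..., ...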
Below is my code:
import requests
from bs4 import BeautifulSoup
import json
import csv
cookies = {
'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
'Content-Type': 'application/json',
'Origin': 'https://www.spoonflower.com',
'Connection': 'keep-alive',
'Referer': 'https://www.spoonflower.com/',
'Sec-GPC': '1',
'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
'TE': 'Trailers',
}
res = requests.get('https://www.spoonflower.com/spoonflower_fabrics')
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [fabric.find('h2').text.strip() for fabric in soup.find_all('div', {'class': 'product_detail medium_text'})]
fabric = []
for fab in fabrics:
fabric.append(("_".join(fab.upper().split())))
#https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
#https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
item_endpoint = 'https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en'
item_response = requests.get(item_endpoint).json()
#item_data = items_json['page_results'][0]
scraped_items = []
for item in item_response['page_results']:
for fab_type in fabric:
details_endpoint = 'https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_' + fab_type + '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=' + str(item['designId']) + '&page_locale=en'
details_endpoint_response = requests.get(details_endpoint, headers=headers, cookies=cookies).json()
designName = item['name']
screenName = item['user']['screenName']
fabric_name = details_endpoint_response['data']['fabric_code']
try:
test_swatch_meter = details_endpoint_response['data']['pricing']['TEST_SWATCH_METER']['price']
except:
test_swatch_meter = 'N/A'
try:
fat_quarter_meter = details_endpoint_response['data']['pricing']['FAT_QUARTER_METER']['price']
except:
fat_quarter_meter = 'N/A'
try:
meter = details_endpoint_response['data']['pricing']['METER']['price']
except:
meter = 'N/A'
scraped_items.append({
'designName': designName,
'screenName': screenName,
'fabric_name': fabric_name,
'test_swatch_meter': test_swatch_meter,
'fat_quarter_meter': fat_quarter_meter,
'meter': meter
})
print(designName, screenName, fabric_name, test_swatch_meter, fat_quarter_meter, meter)
print(json.dumps(scraped_items, indent=2))
#print(type(details_endpoint))
#print(type(items_json['page_results'][0]))
with open('scraped_data.csv', 'w', newline='') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=scraped_items[0].keys())
writer.writeheader()
for row in scraped_items:
writer.writerow(row)
#print(fabric)
One way to do it is to reconfigure how you construct the output. Instead of a list of flat records, use a dictionary keyed on (designName, screenName), with the fabric and price values added as columns. One thing to keep in mind is that dictionaries don't allow duplicate keys, so I had to number the column names; you can remove the numbers later if you'd like.
See if this gets you what you are wanting:
import requests
from bs4 import BeautifulSoup
import json
import csv
import pandas as pd
from collections import OrderedDict
cookies = {
'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
'Content-Type': 'application/json',
'Origin': 'https://www.spoonflower.com',
'Connection': 'keep-alive',
'Referer': 'https://www.spoonflower.com/',
'Sec-GPC': '1',
'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
'TE': 'Trailers',
}
res = requests.get('https://www.spoonflower.com/spoonflower_fabrics')
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [fabric.find('h2').text.strip() for fabric in soup.find_all('div', {'class': 'product_detail medium_text'})]
fabric = []
for fab in fabrics:
fabric.append(("_".join(fab.upper().split())))
#https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
#https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
item_endpoint = 'https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en'
item_response = requests.get(item_endpoint).json()
#item_data = items_json['page_results'][0]
items_dict = OrderedDict()
for item in item_response['page_results']:
for fab_type in fabric:
details_endpoint = 'https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_' + fab_type + '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=' + str(item['designId']) + '&page_locale=en'
details_endpoint_response = requests.get(details_endpoint, headers=headers, cookies=cookies).json()
designName = item['name']
screenName = item['user']['screenName']
fabric_name = details_endpoint_response['data']['fabric_code']
try:
test_swatch_meter = details_endpoint_response['data']['pricing']['TEST_SWATCH_METER']['price']
except:
test_swatch_meter = 'N/A'
try:
fat_quarter_meter = details_endpoint_response['data']['pricing']['FAT_QUARTER_METER']['price']
except:
fat_quarter_meter = 'N/A'
try:
meter = details_endpoint_response['data']['pricing']['METER']['price']
except:
meter = 'N/A'
if (designName, screenName) not in items_dict.keys():
items_dict[(designName, screenName)] = {}
itemCount = len(items_dict[(designName, screenName)].values()) // 4
items_dict[(designName, screenName)].update({'fabric_name_%02d' %itemCount: fabric_name,
'test_swatch_meter_%02d' %itemCount: test_swatch_meter,
'fat_quarter_meter_%02d' %itemCount: fat_quarter_meter,
'meter_%02d' %itemCount: meter})
print(designName, screenName, fabric_name, test_swatch_meter,fat_quarter_meter, meter)
df = pd.DataFrame.from_dict(items_dict, orient='index').reset_index(drop=False)
df = df.rename(columns={'level_0':'designName','level_1':'screenName'})
df.to_csv('scraped_data.csv', index=False)
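If you prefer to keep the flat scraped_items list from the question's code instead, a rough pandas alternative (a sketch, not tested against the live API, and assuming each design/fabric combination appears once) is to build a long DataFrame and unstack it into one row per design:
import pandas as pd
# scraped_items: one dict per (design, fabric) combination, as built in the question's code
df = pd.DataFrame(scraped_items)
wide = (df.set_index(['designName', 'screenName', 'fabric_name'])
          .unstack('fabric_name'))
# flatten the two-level columns, e.g. ('meter', 'PETAL_SIGNATURE_COTTON') -> 'meter_PETAL_SIGNATURE_COTTON'
wide.columns = ['%s_%s' % (price, fab) for price, fab in wide.columns]
wide.reset_index().to_csv('scraped_data_wide.csv', index=False)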
Related
Below is the code for extracting news articles about different companies from Google News. It gives me an empty CSV file with only the headers, and I am not able to figure out what the error is. Can someone please help out? (The entire code can be copy-pasted and reproduced on your machine.)
import requests
import random
from collections import OrderedDict
from htmldate import find_date  # assumption: find_date() used below comes from the htmldate package
def list_header():
headers_list = [
# Firefox 24 Linux
{
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
},
# Firefox Mac
{
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
}]
return headers_list
def list_dict():
# Get headers list
headers_list = list_header()
# Create ordered dict from Headers above
ordered_headers_list = []
for headers in headers_list:
h = OrderedDict()
for header,value in headers.items():
h[header]=value
ordered_headers_list.append(h)
return ordered_headers_list
def list_test():
headers_list = list_dict()
max = len(headers_list)
url = 'https://httpbin.org/headers'
for i in range(0,max):
#Pick a random browser headers
headers = random.choice(headers_list)
#Create a request session
r = requests.Session()
r.headers = headers
response = r.get(url)
print("Request #%d\nUser-Agent Sent:%s\n\nHeaders Recevied by HTTPBin:"%(i,headers['User-Agent']))
print(response.json())
print("-------------------")
def random_header():
headers_list = list_dict()
headers = random.choice(headers_list)
return headers
import pandas as pd
def ingest_google_news():
ticker_list = ['AAPL.O', 'MSFT', 'GOOG', '2222.SR', 'AMZN', 'FB', 'TSLA', 'BRK-A', 'TCEHY', 'TSM', 'NVDA', 'JPM', 'BABA', 'V', 'JNJ', '005930.KS', 'WMT', 'LVMUY']
sep = '.'
df = pd.DataFrame()
t_news = []
t_publisher = []
t_urls = []
t_dates = []
t_tickers = []
for t in ticker_list:
news = []
publisher = []
urls = []
dates = []
tickers = []
# cleaning ticker
ticker = t
t = t.split(sep, 1)[0]
# set header by random user agent
r = requests.Session()
headers = random_header()
r.headers = headers
# print(headers)
# set query for google
query = '{} news'.format(t)
url = f"https://www.google.com/search?q={query}&tbm=nws&lr=lang_en&hl=en&sort=date&num=5"
res = r.get(url, headers=headers)
soup = bs4.BeautifulSoup(res.text, "html.parser")
links = soup.select(".dbsr a")
for l in links:
tickers.append(t)
try:
url_w = l.get("href")
print(url_w)
urls.append(url_w)
dt = find_date(url_w)
dates.append(dt)
res = requests.get(url_w, headers=headers)
parsed_article = bs4.BeautifulSoup(res.text,'lxml')
paragraphs = parsed_article.find_all('p')
article_text = ""
for p in paragraphs:
article_text += p.text
except Exception as e:
article_text = ''
news.append(article_text)
sources = soup.select(".XTjFC g-img")
for s in sources:
publisher.append(s.next_sibling.lower())
t_urls += urls
t_news += news
t_publisher += publisher
t_dates += dates
t_tickers += tickers
df['ticker'] = t_tickers
df['links'] = t_urls
df['article_text'] = t_news
df['publisher'] = t_publisher
df['created_at'] = t_dates
# import to csv
today = datetime.date.today()
d1 = today.strftime("%d%m%Y")
df.to_csv(f'/content/drive/MyDrive/google_news_{d1}.csv')
del news, publisher, urls, dates, tickers
del t_news, t_publisher, t_urls, t_dates, t_tickers
import bs4
from bs4 import BeautifulSoup
import datetime
ingest_google_news()
The code above is from the following link: https://medium.com/analytics-vidhya/google-scraping-using-beautifulsoup-d53746ef5a32
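As a quick sanity check, it can help to confirm whether the .dbsr a selector still matches anything in the HTML Google returns to requests; a minimal sketch, assuming a similar setup to the code above:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0'}
res = requests.get('https://www.google.com/search',
                   params={'q': 'AAPL news', 'tbm': 'nws', 'hl': 'en', 'num': 5},
                   headers=headers)
soup = BeautifulSoup(res.text, 'html.parser')
# if this prints 0, the CSS classes have changed or Google served a different
# layout to the script, which would explain the empty CSV
print(res.status_code, len(soup.select('.dbsr a')))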
Have a look at the SelectorGadget Chrome extension to grab CSS selectors by clicking on the desired element in your browser, and keep a CSS selectors reference handy.
Code and example in the online IDE (extracts the title, link, snippet, date published, source, and stores to CSV):
from bs4 import BeautifulSoup
import requests, lxml
import pandas as pd  # needed for the DataFrame/CSV step at the end
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
ticker_list = ['AAPL.O', 'LVS', 'COTY.K','JPM', 'XOM', '005930.KS']
def get_news():
# iterate over each ticker
for news in ticker_list:
params = {
"q": news, # query
"hl": "en", # language
"gl": "us", # country to search from
"tbm": "nws", # google news filter
}
# store news data
news_data = []
html = requests.get('https://www.google.com/search', headers=headers, params=params)
soup = BeautifulSoup(html.text, 'lxml')
# container with needed data
for result in soup.select('.WlydOe'):
title = result.select_one('.nDgy9d').text
link = result['href']
source = result.select_one('.CEMjEf span').text
snippet = result.select_one('.GI74Re').text
date_published = result.select_one('p.S1FAPd').text
print(f'{title}\n{link}\n{snippet}\n{date_published}\n{source}\n')
# append news data to list in dict() format to save it later via pandas
news_data.append({
'title': title,
'link': link,
'snippet': snippet,
'date_published': date_published,
'source': source
})
# create DataFrame from the list()
df = pd.DataFrame(news_data)
# save DataFrame to csv without default index column on the left side
df.to_csv('bs4_final.csv', index=False)
get_news()
----------
'''
EU plans to legislate for common phone charger despite ...
https://www.reuters.com/technology/eu-plans-legislate-common-phone-charger-despite-apple-grumbles-2021-09-21/
... a person familiar with the matter said - a move likely to affect iPhone
maker Apple (AAPL.O) more than its rivals.
2 hours ago
Reuters
Wall Street ends sharply lower in broad sell-off
https://www.reuters.com/business/wall-street-ends-sharply-lower-broad-sell-off-2021-09-20/
O), Amazon.com Inc (AMZN.O), Apple Inc (AAPL.O), Facebook Inc (FB. ... O)
were among the biggest drags on the index as well as the S&P 500.
18 hours ago
Reuters
... other results
'''
Alternatively, you can achieve the same thing by using the Google News Results API from SerpApi. It's a paid API with a free plan.
The main difference in your example is that you don't need to make things complicated, figure out why things don't work as expected, and then maintain the parser over time. Instead, you only need to iterate over structured JSON and get what you want, fast.
Code to integrate (same process for saving to csv as with code above):
import os
from serpapi import GoogleSearch
ticker_list = ['AAPL.O', 'LVS', 'COTY.K','JPM', 'XOM', '005930.KS']
def get_news():
for news in ticker_list:
params = {
"engine": "google",
"q": news,
"gl": "us",
"tbm": "nws",
"api_key": os.getenv("API_KEY"), # API environment variable
}
search = GoogleSearch(params)
results = search.get_dict()
for news_result in results["news_results"]:
print(f"Title: {news_result['title']}\nLink: {news_result['link']}\n")
get_news()
-----------
'''
Title: EU plans to legislate for common phone charger despite ...
Link: https://www.reuters.com/technology/eu-plans-legislate-common-phone-charger-despite-apple-grumbles-2021-09-21/
Title: Wall Street ends sharply lower in broad sell-off
Link: https://www.reuters.com/business/wall-street-ends-sharply-lower-broad-sell-off-2021-09-20/
Title: S&P 500 down more than 2% as growth worries rise
Link: https://www.reuters.com/business/sp-500-down-more-than-2-growth-worries-rise-2021-09-20/
Title: Apple joins streaming elite, Netflix crosses milestone with ...
Link: https://www.reuters.com/technology/apple-joins-streaming-elite-netflix-crosses-milestone-with-emmy-wins-2021-09-20/
... other results
'''
Disclaimer, I work for SerpApi.
There is this site called dnsdumpster that provides all the sub-domains for a domain. I am trying to automate this process and print out a list of the subdomains. Each individual sub-domain is within the "td" HTML tag. I am trying to iterate through all these tags and print out the sub-domains, but I get an error.
import requests
import re
from bs4 import BeautifulSoup
headers = {
'Host' : 'dnsdumpster.com',
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0',
'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language' : 'en-US,en;q=0.5',
'Accept-Encoding' : 'gzip, deflate',
'DNT' : '1',
'Upgrade-Insecure-Requests' : '1',
'Referer' : 'https://dnsdumpster.com/',
'Connection' : 'close'
}
proxies = {
'http' : 'http://127.0.0.1:8080'
}
domain = 'google.com'
with requests.Session() as s:
url = 'https://dnsdumpster.com'
response = s.get(url, headers=headers, proxies=proxies)
response.encoding = 'utf-8' # Optional: requests infers this internally
soup1 = BeautifulSoup(response.text, 'html.parser')
input = soup1.find_all('input')
csrfmiddlewaretoken_raw = str(input[0])
csrfmiddlewaretoken = csrfmiddlewaretoken_raw[55:119]
data = {
'csrfmiddlewaretoken' : csrfmiddlewaretoken,
'targetip' : domain
}
send_data = s.post(url, data=data, proxies=proxies, headers=headers)
print(send_data.status_code)
soup2 = BeautifulSoup(send_data.text, 'html.parser')
td = soup2.find_all('td')
for i in len(td):
item = str(td[i])
subdomain = item[21:37]
print(subdomain)
Error looks like this:
Traceback (most recent call last):
  File "dns_dumpster_4.py", line 39, in <module>
    for i in len(td):
TypeError: 'int' object is not iterable
And once the above error is solved, I would also need help with another question:
How can I use a regular expression to get the individual sub-domain from within this "td" tag? The contents of the tag are very long and messy, and I only need the subdomain. I would really appreciate it if someone could help me get just the sub-domain name.
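For reference, the TypeError comes from for i in len(td): len(td) is an int, and an int is not iterable. A minimal fix is to loop over the tags themselves (or over range(len(td))) and read their text instead of slicing fixed character positions, using the td list from the code above:
for cell in td:
    text = cell.get_text(strip=True)
    # crude filter: keep strings that look like host names
    if '.' in text:
        print(text)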
I tried to catch the subdomains without using a regex.
import requests
from bs4 import BeautifulSoup
headers = {
'Host' : 'dnsdumpster.com',
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0',
'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language' : 'en-US,en;q=0.5',
'Accept-Encoding' : 'gzip, deflate',
'DNT' : '1',
'Upgrade-Insecure-Requests' : '1',
'Referer' : 'https://dnsdumpster.com/',
'Connection' : 'close'
}
proxies = {
'http' : 'http://127.0.0.1:8080'
}
domain = 'google.com'
with requests.Session() as s:
url = 'https://dnsdumpster.com'
response = s.get(url, headers=headers, proxies=proxies)
response.encoding = 'utf-8' # Optional: requests infers this internally
soup1 = BeautifulSoup(response.text, 'html.parser')
input = soup1.find_all('input')
csrfmiddlewaretoken_raw = str(input[0])
csrfmiddlewaretoken = csrfmiddlewaretoken_raw[55:119]
data = {
'csrfmiddlewaretoken' : csrfmiddlewaretoken,
'targetip' : domain
}
send_data = s.post(url, data=data, proxies=proxies, headers=headers)
print(send_data.status_code)
soup2 = BeautifulSoup(send_data.text, 'html.parser')
td = soup2.find_all('td', {'class': 'col-md-3'})
# for dom in range(0, len(td),2):
# print(td[dom].get_text(strip=True, separator='\n'))
mysubdomain = []
for dom in range( len(td)):
# print(td[dom].get_text(strip=True, separator='\n'))
if '.' in td[dom].get_text(strip=True):
x = td[dom].get_text(strip=True, separator=',').split(',')
mysubdomain.append(x)
# print(x)
# y = td[dom].get_text(strip=True, separator=',').split(',')[1]
# mysubdomain.append(td[dom].get_text(strip=True, separator=','))
print(mysubdomain)
# print(td)
# for i in range(len(td)):
# item = str(td[i])
# print('\n', item, '\n')
# subdomain = item[21:37]
# print(subdomain)
from functools import reduce
flat_list_of_mysubdomain = reduce(lambda x, y: x + y, mysubdomain)
print(flat_list_of_mysubdomain)
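The same flattening can also be done with itertools.chain, which avoids repeatedly concatenating lists:
from itertools import chain
flat_list_of_mysubdomain = list(chain.from_iterable(mysubdomain))
print(flat_list_of_mysubdomain)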
I hope it helps you.
I have already searched for an answer here and spent a long time on Google, but nothing...
I've already tried opening the file with 'w' instead of 'r' or 'a', but I still can't get my code to overwrite the results originally written to the CSV file. I'm basically scraping information from a website, and I want to first search for a term, scrape that data, and save it to the CSV file, AND THEN search for another term, scrape that data, and overwrite the current CSV file with the new data.
#!/usr/bin/python3
#from pyvirtualdisplay import Display
import csv
from bs4 import BeautifulSoup
import urllib.request
def getPageSource(current_page):
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'}
req = urllib.request.Request(current_page, headers=hdr)
page = urllib.request.urlopen(req)
soup = BeautifulSoup(page, "html5lib")
return(soup)
def get_length(file_path):
with open("data.csv", 'r', encoding='utf8') as csvfile:
reader = csv.reader(csvfile)
reader_list = list(reader)
return len(reader_list)
def write_data(file_path, company_name, role, full_url, date):
fieldnames = ['ID', 'Company','Role', 'URL', 'Date']
next_id = get_length(file_path)
with open(file_path, "w", encoding='utf8') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#writer.writeheader()
writer.writerow({
"ID": next_id,
"Company": company_name,
"Role": role,
"URL": full_url,
"Date": date
})
csvfile.close()
def find_data(source):
base_url = 'https://www.irishjobs.ie'
for a in source.find_all(attrs={"itemtype" : "https://schema.org/JobPosting"}):
job_info = a.find('h2').find('a')
company_name = a.find('h3').find('a').get_text()
url = job_info['href']
full_url = (base_url + url)
role = (job_info.get_text())
date = a.find('li',class_='updated-time').get_text().replace('Updated','').strip()
write_data("data.csv", company_name, role, full_url, date)
if __name__ == '__main__':
query = input('Enter role to search: ')
source = getPageSource('https://www.irishjobs.ie/ShowResults.aspx?Keywords='+query+'&Location=102&Category=3&Recruiter=All&SortBy=MostRecent&PerPage=100')
find_data(source)
You need to keep the file open until you have finished writing it. Also, it is easier to keep a count of the rows written (using enumerate()) than to keep trying to read the file back in:
import csv
from bs4 import BeautifulSoup
import urllib.request
def getPageSource(current_page):
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'}
req = urllib.request.Request(current_page, headers=hdr)
page = urllib.request.urlopen(req)
return (BeautifulSoup(page, "html5lib"))
def find_data(source):
base_url = 'https://www.irishjobs.ie'
fieldnames = ['ID', 'Company','Role', 'URL', 'Date']
with open('data.csv', 'w', encoding='utf8', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(fieldnames)
for id, a in enumerate(source.find_all(attrs={"itemtype" : "https://schema.org/JobPosting"}), start=1):
job_info = a.find('h2').find('a')
company_name = a.find('h3').find('a').get_text()
url = job_info['href']
full_url = (base_url + url)
role = (job_info.get_text())
date = a.find('li',class_='updated-time').get_text().replace('Updated','').strip()
writer.writerow([id, company_name, role, full_url, date])
if __name__ == '__main__':
query = input('Enter role to search: ')
source = getPageSource('https://www.irishjobs.ie/ShowResults.aspx?Keywords='+query+'&Location=102&Category=3&Recruiter=All&SortBy=MostRecent&PerPage=100')
find_data(source)
This would give you data.csv starting:
ID,Company,Role,URL,Date
1,Computer Futures,Xamarin Developer,https://www.irishjobs.ie/Jobs/Xamarin-Developer-8143810.aspx,06/03/2018
2,Wallace Myers International,New Business Development Manager,https://www.irishjobs.ie/Jobs/New-Business-Development-Manager-8143989.aspx,06/03/2018
3,Reperio Human Capital Ltd,Senior Software Developer - Dublin,https://www.irishjobs.ie/Jobs/Senior-Software-Developer-Dublin-8150128.aspx,20/03/2018
In your case, it is probably easier to just use a plain csv.writer() rather than a DictWriter().
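For comparison, a minimal sketch of the same loop with csv.DictWriter, assuming the same source, base_url and fieldnames as in find_data() above; the plain csv.writer is simpler here because the rows are already plain lists:
with open('data.csv', 'w', encoding='utf8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for id, a in enumerate(source.find_all(attrs={"itemtype": "https://schema.org/JobPosting"}), start=1):
        job_info = a.find('h2').find('a')
        writer.writerow({
            'ID': id,
            'Company': a.find('h3').find('a').get_text(),
            'Role': job_info.get_text(),
            'URL': base_url + job_info['href'],
            'Date': a.find('li', class_='updated-time').get_text().replace('Updated', '').strip(),
        })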
import requests
from lxml import html
from bs4 import BeautifulSoup
session_requests = requests.session()
sw_url = "https://www.southwest.com"
sw_url2 = "https://www.southwest.com/flight/select-flight.html?displayOnly=&int=HOMEQBOMAIR"
#result = session_requests.get(sw_url)
#tree = html.fromstring(result.text)
payload = {"name":"AirFormModel","origin":"MCI","destination":"DAL","departDate":"2018-02-28T06:00:00.000Z","returnDate":"2018-03-03T06:00:00.000Z","tripType":"true","priceType":"DOLLARS","adult":1,"senior":0,"promoCode":""}
#{
# 'origin': 'MCI',
# 'destination': 'DAL',
# 'departDate':'2018-02-28T06:00:00.000Z',
# 'returnDate':'2018-03-01T06:00:00.000Z',
# 'adult':'1'
#}
p = requests.post(sw_url,params=payload)
#print(p.text)
print(p.content)
p1 = requests.get(sw_url2)
soup = BeautifulSoup(p.text,'html.parser')
print(soup.find("div",{"class":"productPricing"}))
pr = soup.find_all("span",{"class":"currency_symbol"})
for tag in pr:
print(tag)
print('++++')
print(tag.next_sibling)
print(soup.find("div",{"class":"twoSegments"}))
soup = BeautifulSoup(p1.text,'html.parser')
print(soup.find("div",{"class":"productPricing"}))
pr = soup.find_all("span",{"class":"currency_symbol"})
for tag in pr:
print(tag)
print('++++')
print(tag.next_sibling)
print(soup.find("div",{"class":"twoSegments"}))
I need to retrieve prices for flights between two locations on specific dates. I identified the parameters by looking at the session info in the browser's inspector and included them in the POST request.
I am not sure what I'm doing wrong here, but I am unable to read the data from the tags correctly; it prints None.
Edit: 4/25/2018
I'm using the following code now, but it doesn't seem to help. Please advise.
import threading
from lxml import html
from bs4 import BeautifulSoup
import time
import datetime
import requests
def worker(oa,da,ods):
"""thread worker function"""
print (oa + ' ' + da + ' ' + ods + ' ' + str(datetime.datetime.now()))
url = "https://www.southwest.com/api/air-booking/v1/air-booking/page/air/booking/shopping"
rh = {
'accept': 'application/json,text/javascript,*/*;q=0.01',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.5',
'cache-control': 'max-age=0',
'content-length': '454',
'content-type': 'application/json',
'referer': 'https://www.southwest.com/air/booking/select.html?originationAirportCode=MCI&destinationAirportCode=LAS&returnAirportCode=&departureDate=2018-05-29&departureTimeOfDay=ALL_DAY&returnDate=&returnTimeOfDay=ALL_DAY&adultPassengersCount=1&seniorPassengersCount=0&fareType=USD&passengerType=ADULT&tripType=oneway&promoCode=&reset=true&redirectToVision=true&int=HOMEQBOMAIR&leapfrogRequest=true',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}
fd = {
'returnAirport':'',
'twoWayTrip':'false',
'fareType':'DOLLARS',
'originAirport':oa,
'destinationAirport':da,
'outboundDateString':ods,
'returnDateString':'',
'adultPassengerCount':'1',
'seniorPassengerCount':'0',
'promoCode':'',
'submitButton':'true'
}
with requests.Session() as s:
r = s.post(url,headers = rh )
# soup = BeautifulSoup(r.content,'html.parser')
# soup = BeautifulSoup(r.content,'lxml')
print(r)
print(r.content)
print (oa + ' ' + da + ' ' + ods + ' ' + str(datetime.datetime.now()))
return
#db = MySQLdb.connect(host="localhost",user="root",passwd="vikram",db="garmin")
rcount = 0
tdelta = 55
#print(strt_date)
threads = []
count = 1
thr_max = 2
r = ["MCI","DEN","MCI","MDW","MCI","DAL"]
strt_date = (datetime.date.today() + datetime.timedelta(days=tdelta)).strftime("%m/%d/%Y")
while count < 2:
t = threading.Thread(name=r[count-1]+r[count],target=worker,args=(r[count-1],r[count],strt_date))
threads.append(t)
t.start()
count = count + 2
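One detail worth noting in the edited code above: fd is built but never passed to s.post(), so the request goes out with no body. Since the rh headers declare content-type: application/json, the payload would presumably need to be sent as a JSON body, along the lines of the sketch below (the expected field names are unverified):
with requests.Session() as s:
    # send the form dict as a JSON body instead of omitting it
    r = s.post(url, headers=rh, json=fd)
    print(r.status_code)
    print(r.content)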
When you say you looked at the session info in the browser's inspector, I'm assuming you mean the Network tab. If that's the case, are you sure you noted the data being sent correctly?
Here's the URL that the browser actually POSTs to, following which the page you want is fetched:
url = 'https://www.southwest.com/flight/search-flight.html'
You didn't use headers in your request; in my opinion, they should be passed in cases like this. Here are the headers that the browser sends:
:authority:www.southwest.com
:method:POST
:path:/flight/search-flight.html
:scheme:https
accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
accept-encoding:gzip, deflate, br
accept-language:en-US,en;q=0.9
cache-control:max-age=0
content-length:564
content-type:application/x-www-form-urlencoded
origin:https://www.southwest.com
referer:https://www.southwest.com/flight/search-flight.html?int=HOMEQBOMAIR
upgrade-insecure-requests:1
user-agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36
Note:
I removed the cookie header, because that is taken care of by requests when you use a session.
The first four headers (those that begin with a colon ':') cannot be passed in Python's requests, so I skipped them.
Here's the dict that I used to pass the headers:
rh = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'cache-control': 'max-age=0',
'content-length': '564',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://www.southwest.com',
'referer': 'https://www.southwest.com/flight/search-flight.html?int=HOMEQBOMAIR',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
}
And here is the form data sent by browser:
fd = {
'toggle_selfltnew': '',
'toggle_AggressiveDrawers': '',
'transitionalAwardSelected': 'false',
'twoWayTrip': 'true',
'originAirport': 'MCI',
# 'originAirport_displayed': 'Kansas City, MO - MCI',
'destinationAirport': 'DAL',
# 'destinationAirport_displayed': 'Dallas (Love Field), TX - DAL',
'airTranRedirect': '',
'returnAirport': 'RoundTrip',
'returnAirport_displayed': '',
'outboundDateString': '02/28/2018',
'outboundTimeOfDay': 'ANYTIME',
'returnDateString': '03/01/2018',
'returnTimeOfDay': 'ANYTIME',
'adultPassengerCount': '1',
'seniorPassengerCount': '0',
'promoCode': '',
'fareType': 'DOLLARS',
'awardCertificateToggleSelected': 'false',
'awardCertificateProductId': ''
}
Note that I commented out two of the items above, but it didn't make any difference. I assumed you'd have only the location codes and not the full names; if you do have them, or can extract them from the page, you can send those as well along with the other data.
I don't know if it makes any difference, but I used data instead of params:
with requests.Session() as s:
r = s.post(url, headers = rh, data = fd)
soup = BeautifulSoup(r.content, 'lxml')
Finally, here is the result:
>>> soup.find('span', {'class': 'currency_symbol'}).text
'$'
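As a follow-up, the numeric amount can usually be picked up from the text adjacent to the symbol, in the same spirit as the next_sibling approach in the question; a rough sketch, assuming the same soup as above:
for symbol in soup.find_all('span', {'class': 'currency_symbol'}):
    amount = symbol.next_sibling      # the text node that typically follows the symbol tag
    if amount:
        print(symbol.text + str(amount).strip())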
I need to get the information that was published today and the day before. Also, when exporting it to a CSV file, only the first column is written, not the remaining ones.
The URL: https://e-mehkeme.gov.az/Public/Cases
The dates are stored in the HTML as <td style="width:95px;text-align:center">28.10.2019</td>
import requests, re
from bs4 import BeautifulSoup as bs
import csv
request_headers = {
'authority': 'e-mehkeme.gov.az',
'method': 'POST',
'path': '/Public/Cases',
'scheme': 'https',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
'application/signed-exchange;v=b3',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en,en-GB;q=0.9',
'cache-control': 'max-age=0',
'content-length': '66',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://e-mehkeme.gov.az',
'referer': 'https://e-mehkeme.gov.az/Public/Cases',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/75.0.3770.142 Safari/537.36',
}
voens = {'3100608381',
}
form_data = {
'CourtId': '',
'CaseNo': '',
'DocFin': '',
'DocSeries': '',
'DocNumber': '',
'VOEN': voens,
'button': 'Search',
}
url = 'https://e-mehkeme.gov.az/Public/Cases?courtid='
response = requests.post(url, data=form_data, headers=request_headers)
s = bs(response.content, 'lxml')
# PRINT THE CONTENTS OF EACH SEARCH!
for voen in voens:
form_data['VOEN'] = voen
r = requests.post('https://e-mehkeme.gov.az/Public/Cases', data=form_data)
soup = bs(r.text, 'lxml')
ids = [i['value'] for i in soup.select('.casedetail')]
for i in ids:
r = requests.get(f'https://e-mehkeme.gov.az/Public/CaseDetail?caseId={i}')
soup = bs(r.content, 'lxml')
output = [re.sub('\s+', ' ', i.text.strip()) for i in soup.select('[colspan="4"]')]
print(output)
with open('courtSearch.csv', 'w', newline='', encoding='utf-8') as myfile:
writer = csv.writer(myfile, quoting=csv.QUOTE_ALL)
writer.writerow(output)
DESIRED OUTPUT:
The following uses a slightly different URL construct so you can use a GET request and easily gather all pages of results per voen. I gather the string dates and caseIds (required for the later requests) during each request. I then use a mask (the days of interest, e.g. today and yesterday, converted to strings in the same format as on the website) to filter for only the ids within the desired date range. I then loop over that filtered list and issue requests for the pop-up window info.
Within the code you can also see commented-out sections. One of them shows you the results retrieved from each page:
#print(pd.read_html(str(soup.select_one('#Cases')))[0]) ##view table
I am splitting on the header phrases (assuming these are regular) so that I can break each row string into the appropriate output columns.
Possibly requires bs4 4.7.1+.
import requests,re, csv
from bs4 import BeautifulSoup as bs
from datetime import datetime, timedelta
import pandas as pd
headers = ['Ətraflı məlumat: ', 'Cavabdeh: ', 'İddiaçı: ', 'İşin mahiyyəti ']
voens = ['2002283071','1303450301', '1700393071']
number_of_past_days_plus_today = 2
mask = [datetime.strftime(datetime.now() - timedelta(day_no), '%d.%m.%Y') for day_no in range(0, number_of_past_days_plus_today)]
ids = []
table_dates = []
with requests.Session() as s:
for voen in voens:
#print(voen) ##view voen
page = 1
while True:
r = s.get(f'https://e-mehkeme.gov.az/Public/Cases?page={page}&voen={voen}') #to get all pages of results
soup = bs(r.text, 'lxml')
ids.extend([i['value'] for i in soup.select('.casedetail')])
#print(pd.read_html(str(soup.select_one('#Cases')))[0]) ##view table
table_dates.extend([i.text.strip() for i in soup.select('#Cases td:nth-child(2):not([colspan])')])
if soup.select_one('[rel=next]') is None:
break
page+=1
pairs = list(zip(table_dates,ids))
filtered = [i for i in pairs if i[0] in mask]
#print(100*'-') ##spacing
#print(filtered) ##view final filtered list of ids
results = []
for j in filtered:
r = s.get(f'https://e-mehkeme.gov.az/Public/CaseDetail?caseId={j[1]}')
soup = bs(r.content, 'lxml')
line = ' '.join([re.sub('\s+',' ',i.text.strip()) for i in soup.select('[colspan="4"]')])
row = re.split('|'.join(headers),line)
results.append(row[1:])
with open("results.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
w = csv.writer(csv_file, delimiter = ",", quoting=csv.QUOTE_MINIMAL)
w.writerow(headers)
for row in results:
w.writerow(row)
I searched for splitting on multiple delimiters and used the idea given by @Jonathan here, so I upvoted to credit that user.
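As a small illustration of that multi-delimiter split (the sample line below is made up): joining the header phrases with '|' builds an alternation pattern, and because the row text starts with one of the delimiters, the first element of the result is an empty string, which is why the code above keeps row[1:].
import re
headers = ['Ətraflı məlumat: ', 'Cavabdeh: ', 'İddiaçı: ', 'İşin mahiyyəti ']
line = 'Ətraflı məlumat: 2(102)-123/2019 Cavabdeh: Some MMC İddiaçı: Some Agency İşin mahiyyəti Some claim'
row = re.split('|'.join(headers), line)
print(row)       # ['', '2(102)-123/2019 ', 'Some MMC ', 'Some Agency ', 'Some claim']
print(row[1:])   # matches results.append(row[1:]) in the code above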