webscrape and splitting of retrieved data into different lines

webscrape and splitting of retrieved data into different lines - python

I am trying to collect the event date, time and venue. They came out successfully but then it is not reader friendly. How do I get the date, time and venue to appear separately like:
- event
Date:
Time:
Venue:
- event
Date:
Time:
Venue:
I was thinking of splitting but I ended up with lots of [ ] which made it looked even uglier. I thought of stripping but my regular expression but it does not appear to do anything. Any suggestions?
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
url_toscrape = "https://www.ntu.edu.sg/events/Pages/default.aspx"
response = urllib.request.urlopen(url_toscrape)
info_type = response.info()
responseData = response.read()
soup = BeautifulSoup(responseData, 'lxml')
events_absFirst = soup.find_all("div",{"class": "ntu_event_summary_title_first"})
date_absAll = tr.find_all("div",{"class": "ntu_event_summary_date"})
events_absAll = tr.find_all("div",{"class": "ntu_event_summary_title"})
for first in events_absFirst:
print('-',first.text.strip())
print (' ',date)
for tr in soup.find_all("div",{"class":"ntu_event_detail"}):
date_absAll = tr.find_all("div",{"class": "ntu_event_summary_date"})
events_absAll = tr.find_all("div",{"class": "ntu_event_summary_title"})
for events in events_absAll:
events = events.text.strip()
for date in date_absAll:
date = date.text.strip('^Time.*')
print ('-',events)
print (' ',date)

You can iterate over the divs containing the event information, store the results, and then print each:
import requests, re
from bs4 import BeautifulSoup as soup
d = soup(requests.get('https://www.ntu.edu.sg/events/Pages/default.aspx').text, 'html.parser')
results = [[getattr(i.find('div', {'class':re.compile('ntu_event_summary_title_first|ntu_event_summary_title')}), 'text', 'N/A'), getattr(i.find('div', {'class':'ntu_event_summary_detail'}), 'text', 'N/A')] for i in d.find_all('div', {'class':'ntu_event_articles'})]
new_results = [[a, re.findall('Date : .*?(?=\sTime)|Time : .*?(?=Venue)|Time : .*?(?=$)|Venue: [\w\W]+', b)] for a, b in results]
print('\n\n'.join('-{}\n{}'.format(a, '\n'.join(f' {h}:{i}' for h, i in zip(['Date', 'Time', 'Venue'], b))) for a, b in new_results))
Output:
-7th ASEF Rectors' Conference and Students' Forum (ARC7)
Date:Date : 29 Nov 2018 to 14 May 2019
Time:Time : 9:00am to 5:00pm
-Be a Youth Corps Leader
Date:Date : 1 Dec 2018 to 31 Mar 2019
Time:Time : 9:00am to 5:00pm
-NIE Visiting Artist Programme January 2019
Date:Date : 14 Jan 2019 to 11 Apr 2019
Time:Time : 9:00am to 8:00pm
Venue:Venue: NIE Art gallery
-Exercise Classes for You: Healthy Campus#NTU
Date:Date : 21 Jan 2019 to 18 Apr 2019
Time:Time : 6:00pm to 7:00pm
Venue:Venue: The Wave # Sports & Recreation Centre
-[eLearning Course] Information & Media Literacy (From January 2019)
Date:Date : 23 Jan 2019 to 31 May 2019
Time:Time : 9:00am to 5:00pm
Venue:Venue: NTULearn
...

You could use requests and test the length of stripped_strings
import requests
from bs4 import BeautifulSoup
import pandas as pd
url_toscrape = "https://www.ntu.edu.sg/events/Pages/default.aspx"
response = requests.get(url_toscrape)
soup = BeautifulSoup(response.content, 'lxml')
events = [item.text for item in soup.select("[class^='ntu_event_summary_title']")]
data = soup.select('.ntu_event_summary_date')
dates = []
times = []
venues = []
for item in data:
strings = [string for string in item.stripped_strings]
if len(strings) == 3:
dates.append(strings[0])
times.append(strings[1])
venues.append(strings[2])
elif len(strings) == 2:
dates.append(strings[0])
times.append(strings[1])
venues.append('N/A')
elif len(strings) == 1:
dates.append(strings[0])
times.append('N/A')
venues.append('N/A')
results = list(zip(events, dates, times, venues))
df = pd.DataFrame(results)
print(df)

Related

Extract elements between two tags with Beautiful Soup and Python

I want to crawl this website http://www.truellikon.ch/freizeit-kultur/anlaesse-agenda.html .
I want to extract date and time of each event.
You can see that date is listed above events. In order to extract date and time I need to combine different divs, but the problem is that I do not have 'container' for group of events that are on the same date.
So the only thing that I can do is to extract all events that are between two divs that refer to date.
This is the code for extracting the event info:
from bs4 import BeautifulSoup
import requests
domain = 'truellikon.ch'
url = 'http://www.truellikon.ch/freizeit-kultur/anlaesse-agenda.html'
def get_website_news_links_truellikonCh():
response = requests.get(url, allow_redirects=True)
print("Response for", url, response)
soup = BeautifulSoup(response.content, 'html.parser')
all_events = soup.select('div.eventItem')
for i in all_events:
print(i)
print()
input()
x = get_website_news_links_truellikonCh()
Class name for date is 'listThumbnailMonthName'
My question is how can I combine these divs, how can I write the selectors so that I can get exact date and time, title and body of each event

you have one parent container which is #tx_nezzoagenda_list and then you have to read the children one by one
import re
from bs4 import BeautifulSoup
import requests
url = 'http://www.truellikon.ch/freizeit-kultur/anlaesse-agenda.html'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
container = soup.select_one('#tx_nezzoagenda_list')
for child in container.children:
if not child.name:
continue
if 'listThumbnailMonthName' in child.get('class'):
base_date=child.text.strip()
else:
day=child.select_one('.dateDayNumber').text.strip()
title=child.select_one('.titleText').text.strip()
locationDate=child.select_one('.locationDateText').children
time=list(locationDate)[-1].strip()
time=re.sub('\s','', time)
print(title, day, base_date, time)
which outputs
Abendunterhaltung TV Trüllikon 10 Dezember 2021 19:00Uhr-3:00Uhr
Christbaum-Verkauf 18 Dezember 2021 9:30Uhr-11:00Uhr
Silvester Party 31 Dezember 2021 22:00Uhr
Neujahrsapéro 02 Januar 2022 16:00Uhr-18:00Uhr
Senioren-Zmittag 21 Januar 2022 12:00Uhr-15:00Uhr
Theatergruppe "Nume Hüür", Aufführung 23 Januar 2022 13:00Uhr-16:00Uhr
Elektroschrottsammlung 29 Januar 2022 9:00Uhr-12:00Uhr
Senioren Z'mittag 18 Februar 2022 12:00Uhr-15:00Uhr
Frühlingskonzert 10 April 2022 12:17Uhr
Weinländer Musiktag 22 Mai 2022 8:00Uhr
Auffahrtskonzert Altersheim 26 Mai 2022 10:30Uhr
Feierabendmusik und Jubilarenehrung 01 Juli 2022 19:00Uhr
Feierabendmusik 15 Juli 2022 12:24Uhr
Feierabendmusik 19 August 2022 19:00Uhr
Herbstanlass 19 November 2022 20:00Uhr

Scrape information about iXBRL link inside source code with python Beautiful Soup

First of all, here is page i want to scrape : https://find-and-update.company-information.service.gov.uk/company/09382107/filing-history
I have a python script that currently gets me all the iXBRL files and save them inside my folder. I would like to get the informations about those files : "Date" and "Description" as string, my problem is that i only want to get the "Date" and "Description" of the file with iXBRL link.
This is what I have so far:
link_filling_historic = "https://find-and-update.company-information.service.gov.uk/company/09382107/filing-history"
r = requests.get(link_filling_historic)
html = r.text
soup = BeautifulSoup(html, "html.parser")
info = soup.find('table', {'class': 'full-width-table'})
info = soup.findChildren(['tr'])
info_of_iXBRL_files = []
other_info = []
for item in info:
if "xhtml" in item:
info_of_iXBRL_files.append(item)
else :
other_info.append(item)
print(info_of_iXBRL_files)
output :
[]
My idea was to keep the item from my list with "xhtml" inside it and remove the other to be able then to extract easely the text from each item.
The output is an empty list meaning that he doesn't recognize any "xhtml" string in the item of my list. I don't get why
Any suggestions on how I might get this to work? Thanks in advance.

import requests
from bs4 import BeautifulSoup
URL = 'https://find-and-update.company-information.service.gov.uk/company/09382107/filing-history'
page = requests.get(URL)
soup = BeautifulSoup(page.content)
table_div = soup.find('table',class_='full-width-table')
rows = table_div.find_all('tr')
table = {}
row_index = 1
for tr in rows:
tds = tr.find_all('td')
ls = []
td_index = 0
for td in tds:
text = td.get_text().strip()
if td_index != 1 and td_index != 3:
ls.append(text)
if td_index == 3:
#a = td.find_all('a',string="Download iXBRL",href=True)
#if len(a) > 0:
#for tag in a:
#href = tag['href']
a = td.select_one('a',string="Download iXBRL",href=True)
if a:
href = a['href']
ls.append(href)
table[row_index] = ls
row_index += 1
td_index += 1
print(table)
Output
{1: ['04 Mar 2021',
'Micro company accounts made up to 31 December 2019',
'/company/09382107/filing-history/MzI5MzM0OTU0OGFkaXF6a2N4/document?format=xhtml&download=1'],
2: ['04 Mar 2021',
'Micro company accounts made up to 31 December 2018',
'/company/09382107/filing-history/MzI5MzMzNTIwM2FkaXF6a2N4/document?format=xhtml&download=1'],
3: ['09 Nov 2018',
'Accounts for a dormant company made up to 31 December 2017',
'/company/09382107/filing-history/MzIxOTA5MTA2N2FkaXF6a2N4/document?format=xhtml&download=1'],
4: ['06 Dec 2017',
'Accounts for a dormant company made up to 31 January 2017',
'/company/09382107/filing-history/MzE5MjEyNzU2M2FkaXF6a2N4/document?format=xhtml&download=1'],
5: ['04 Nov 2016',
'Accounts for a dormant company made up to 31 January 2016',
'/company/09382107/filing-history/MzE2MTE5NTk3NWFkaXF6a2N4/document?format=xhtml&download=1']}
In this table there is an td without required in your output with index 1

Returning table information based on a condition in Beautiful Soup/Python

I'm trying to scrape this page: https://www.nysenate.gov/legislation/bills/2019/s8450
I only want to pull information from the table (the one that appears when you click "view actions") If it contains the following string: "Delivered To Governor".
I can iterate through the table, but then I have trouble trying to strip away all the extra tag-text.
url = "https://www.nysenate.gov/legislation/bills/2019/s8450"
raw_html = requests.get(url).content
soup = BeautifulSoup(raw_html, "html.parser")
bill_life_cycle_table = soup.find("tbody")
bill_life_cycle_table

you can provide if condition to check if string is present in the cell and find the previous cell value. Use css selector select()
from bs4 import BeautifulSoup
import requests
url = "https://www.nysenate.gov/legislation/bills/2019/s8450"
raw_html = requests.get(url).content
soup = BeautifulSoup(raw_html, "html.parser")
tablebody=soup.select_one(".table.c-bill--actions-table > tbody")
for item in tablebody.select("td"):
if "delivered to governor" in item.text:
print(item.find_previous("td").text)
Console output:
Dec 11, 2020

Use the bs4.element.Tag.text method:
from bs4 import BeautifulSoup
import requests
url = "https://www.nysenate.gov/legislation/bills/2019/s8450"
raw_html = requests.get(url).content
soup = BeautifulSoup(raw_html, "html.parser")
bill_life_cycle_table = soup.find("tbody")
print(bill_life_cycle_table.text)
Output:
Dec 11, 2020
delivered to governor
Jul 23, 2020
returned to assemblypassed senate3rd reading cal.908substituted for s8450c
Jul 23, 2020
substituted by a10500c
Jul 22, 2020
ordered to third reading cal.908
Jul 20, 2020
reported and committed to rules
Jul 18, 2020
print number 8450c
Jul 18, 2020
amend and recommit to health
Jul 09, 2020
print number 8450b
Jul 09, 2020
amend and recommit to health
Jun 05, 2020
print number 8450a
Jun 05, 2020
amend and recommit to health
Jun 03, 2020
referred to health
UPDATE:
For the printing date condition:
from bs4 import BeautifulSoup
import requests
url = "https://www.nysenate.gov/legislation/bills/2019/s8450"
raw_html = requests.get(url).content
soup = BeautifulSoup(raw_html, "html.parser")
bill_life_cycle_table = soup.find("tbody").text.splitlines()
for a, b in zip(bill_life_cycle_table, bill_life_cycle_table[1:]):
if b.title() == "Delivered To Governor":
print(a)
Output:
Dec 11, 2020

You can read in the <table> tag with pandas' (it uses BeautifulSoup under the hood). then filter by the column and return the date.
Code:
import pandas as pd
url = "https://www.nysenate.gov/legislation/bills/2019/s8450"
df = pd.read_html(url)[0]
date = df[df.iloc[:,-1] == 'delivered to governor'].iloc[0,0]
Output:
print (date)
Dec 11, 2020

Scraping pagination via "page=" midway in url

I'm trying to scrape data from this webpage, and all 900+ pages that follow: https://hansard.parliament.uk/search/Contributions?endDate=2019-07-11&page=1&searchTerm=%22climate+change%22&startDate=1800-01-01&partial=True
It's important that the scraper does not target the pagination link, but rather iterates through the "page=" number in the url. This is because the data present is loaded dynamically in the original webpage, which the pagination links point back to.
I've tried writing something that loops through the page numbers in the url, via the "last" class of the pagination ul, to find the final page, but I am not sure how to target the specific part of the url, whilst keeping the search query the same for each result
r = requests.get(url_pagination)
soup = BeautifulSoup(r.content, "html.parser")
page_url = "https://hansard.parliament.uk/search/Contributions?endDate=2019-07-11&page={}" + "&searchTerm=%22climate+change%22&startDate=1800-01-01&partial=True"
last_page = soup.find('ul', class_='pagination').find('li', class_='last').a['href'].split('=')[1]
dept_page_url = [page_url.format(i) for i in range(1, int(last_page)+1)]
print(dept_page_url)
I would ideally like to scrape just the name from class "secondaryTitle", and the 2nd unnamed div that contains the date, per row.
I keep getting an error: ValueError: invalid literal for int() with base 10: '2019-07-11&searchTerm'

You could try this script, but beware, it goes from page 1 all the way to last page 966:
import requests
from bs4 import BeautifulSoup
next_page_url = 'https://hansard.parliament.uk/search/Contributions?endDate=2019-07-11&page=1&searchTerm=%22climate+change%22&startDate=1800-01-01&partial=True'
# this goes to page '966'
while True:
print('Scrapping {} ...'.format(next_page_url))
r = requests.get(next_page_url)
soup = BeautifulSoup(r.content, "html.parser")
for secondary_title, date in zip(soup.select('.secondaryTitle'), soup.select('.secondaryTitle + *')):
print('{: >20} - {}'.format(date.get_text(strip=True), secondary_title.get_text(strip=True)))
next_link = soup.select_one('a:has(span:contains(Next))')
if next_link:
next_page_url = 'https://hansard.parliament.uk' + next_link['href'] + '&partial=True'
else:
break
Prints:
Scrapping https://hansard.parliament.uk/search/Contributions?endDate=2019-07-11&page=1&searchTerm=%22climate+change%22&startDate=1800-01-01&partial=True ...
17 January 2007 - Ian Pearson
21 December 2017 - Baroness Vere of Norbiton
2 May 2019 - Lord Parekh
4 February 2013 - Baroness Hanham
21 December 2017 - Baroness Walmsley
9 February 2010 - Colin Challen
6 February 2002 - Baroness Farrington of Ribbleton
24 April 2007 - Barry Gardiner
17 January 2007 - Rob Marris
7 March 2002 - The Parliamentary Under-Secretary of State, Department for Environment, Food and Rural Affairs (Lord Whitty)
27 October 1999 - Mr. Tom Brake (Carshalton and Wallington)
9 February 2004 - Baroness Miller of Chilthorne Domer
7 March 2002 - The Secretary of State for Environment, Food and Rural Affairs (Margaret Beckett)
27 February 2007 -
8 October 2008 - Baroness Andrews
24 March 2011 - Lord Henley
21 December 2017 - Lord Krebs
21 December 2017 - Baroness Young of Old Scone
16 June 2009 - Mark Lazarowicz
14 July 2006 - Lord Rooker
Scrapping https://hansard.parliament.uk/search/Contributions?endDate=2019-07-11&searchTerm=%22climate+change%22&startDate=1800-01-01&page=2&partial=True ...
12 October 2006 - Lord Barker of Battle
29 January 2009 - Lord Giddens
... and so on.

Your error is because you are using the wrong number from your split. You want -1. Observe:
last_page = soup.find('ul', class_='pagination').find('li', class_='last').a['href']
print(last_page)
print(last_page.split('=')[1])
print(last_page.split('=')[-1])
Gives:
/search/Contributions?endDate=2019-07-11&searchTerm=%22climate+change%22&startDate=1800-01-01&page=966
when split and use 1
2019-07-11&searchTerm
versus -1
966
To get the info from each page you want I would do pretty much what the other answer does in terms of css selectors and zipping. Some other looping constructs below and use of Session for efficiency given number of requests.
You could make an initial request and extract the number of pages then loop for those. Use Session object for efficiency of connection re-use.
import requests
from bs4 import BeautifulSoup as bs
def make_soup(s, page):
page_url = "https://hansard.parliament.uk/search/Contributions?endDate=2019-07-11&page={}&searchTerm=%22climate+change%22&startDate=1800-01-01&partial=True"
r = s.get(page_url.format(page))
soup = bs(r.content, 'lxml')
return soup
with requests.Session() as s:
soup = make_soup(s, 1)
pages = int(soup.select_one('.last a')['href'].split('page=')[1])
for page in range(2, pages + 1):
soup = make_soup(s, page)
#do something with soup
You could loop until class last ceases to appear
import requests
from bs4 import BeautifulSoup as bs
present = True
page = 1
#results = {}
def make_soup(s, page):
page_url = "https://hansard.parliament.uk/search/Contributions?endDate=2019-07-11&page={}&searchTerm=%22climate+change%22&startDate=1800-01-01&partial=True"
r = s.get(page_url.format(page))
soup = bs(r.content, 'lxml')
return soup
with requests.Session() as s:
while present:
soup = make_soup(s, page)
present = len(soup.select('.last')) > 0
#results[page] = soup.select_one('.pagination-total').text
#extract info
page+=1

Web Scraping Ali Express Order Data

I am trying to scrape some data from the AliExpress website, and I have no clue on how to proceed. Started to do this manually but this will easily take me hours I guess :/ I basically would like to extract the following datasets:
(i) Orders per country
For a given product, I want to get the ~1000 last orders with the country of destination in an excel. For example, take the following product: https://www.aliexpress.com/item/Bluedio-T4S-Active-Noise-Cancelling-Wireless-Bluetooth-Headphones-wireless-Headset-with-Mic/32821244791.html?spm=2114.search0103.3.1.3b0615cfrdkG5X&ws_ab_test=searchweb0_0,searchweb201602_1_10152_10151_10065_10344_10068_10342_10343_10340_10341_10084_10083_10618_10304_10307_10306_10302_5711211_10313_10059_10534_100031_10103_10627_10626_10624_10623_10622_5722411_10621_10620_5711311,searchweb201603_25,ppcSwitch_5&algo_expid=ce68d26f-337b-49ac-af00-48c5b4c4c5c4-0&algo_pvid=ce68d26f-337b-49ac-af00-48c5b4c4c5c4&transAbTest=ae803_3&priceBeautifyAB=0
Picture:Transaction history
Here my goal is to get an excel with columns : Date (or some other unique identifier) - country of buyer - number of pieces.
So for the first buyer on the picture this would be something like "10 Mar 2018 00:11" - "RU" - "1 piece". And then this for about 100-120 of these pages (so about ~1000 customers in total) in a CSV file.
Could anyone help me with how to code this in e.g., Python? Or any ideas on tools that I can use?
(ii) Total orders per subcategory
For a given (sub)category, for example, "Beauty and Health - Health care" (https://www.aliexpress.com/category/200002496/health-care.html?spm=2114.search0103.3.19.696619daL05kcB&site=glo&g=y) I would like to sum all the orders across the 100 pages of products. On the picture the orders are circled in yellow.
Picture:Products with number of orders
So output could be simply the total number of orders within this category. (This would be a sum over 100 pages of 48 products per page)
Is this something that would be possible in Python? I have some very basic experience with Python, but not enough to actually build something like this.
Would be very grateful if someone could help me get started!
Thanks a lot in advance!
Bruce
UPDATE: I managed to do (i) thanks to Delirious Lettuce. For (ii) I have built the below code, which works fine for ~5 pages, but starts omitting products/ jumping around after this. Is this because of the code? Or could this be because perhaps they restrict pulling too much data from the server?
import bs4
import csv
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
filename="Dresses.csv"
f=open(filename,"w")
headers="product_ID, orders\n"
f.write(headers)
for p in range(1,100):
my_url='https://www.aliexpress.com/category/200003482/dresses/' + str(p)
+'.html?site=glo&g=y&SortType=total_tranpro_desc&needQuery=n&tag='
#had to split the above link because it did not fit on one line
uClient=uReq(my_url)
page_html=uClient.read()
uClient.close()
page_soup=soup(page_html,"html.parser")
containers=page_soup.findAll("div",{"class":"item"})
for container in containers:
em_order = container.em
order_num = em_order.text
product_ID = container.input["value"]
f.write(product_ID + "," + order_num + "\n")
f.close()

Partial answer since I don't have time to look at part 2 right now but here is a solution for part 1 that I wrote using Python 3.6.4. I'll try to update later with part 2 as well.
import csv
import requests
def _get_transactions(*, product_id, page_num):
headers = {
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
params = {
'productId': product_id,
'type': 'default',
'page': page_num
}
url = 'https://feedback.aliexpress.com/display/evaluationProductDetailAjaxService.htm'
r = requests.get(url, headers=headers, params=params)
r.raise_for_status()
return r.json()
def get_product_transactions(*, product_id, transaction_pages=1):
transactions = []
for page_num in range(1, transaction_pages + 1):
current_transactions = _get_transactions(
product_id=product_id,
page_num=page_num
)
transactions.extend(current_transactions['records'])
return transactions
if __name__ == '__main__':
product_id = '32821244791'
transactions = get_product_transactions(
product_id=product_id,
transaction_pages=3
)
with open('{}_transactions.csv'.format(product_id), 'w') as f:
writer = csv.DictWriter(f, fieldnames=('date', 'country', 'pieces'))
writer.writeheader()
for transaction in transactions:
writer.writerow({
'date': transaction['date'],
'country': transaction['countryCode'],
'pieces': transaction['quantity']
})
Output file '32821244791_transactions.csv'
date,country,pieces
12 Mar 2018 14:42,hu,1
12 Mar 2018 14:16,be,1
12 Mar 2018 13:47,kr,1
12 Mar 2018 13:25,br,1
12 Mar 2018 13:13,ru,3
12 Mar 2018 12:41,fr,1
12 Mar 2018 11:42,es,1
12 Mar 2018 11:15,ru,1
12 Mar 2018 11:05,ru,1
12 Mar 2018 10:45,ro,1
12 Mar 2018 10:44,ru,1
12 Mar 2018 10:00,kz,1
12 Mar 2018 10:00,in,1
12 Mar 2018 09:51,fr,1
12 Mar 2018 09:39,nl,1
12 Mar 2018 09:26,fr,1
12 Mar 2018 09:24,ru,1
12 Mar 2018 09:19,cz,1
12 Mar 2018 09:00,ru,1
12 Mar 2018 08:46,ru,1
12 Mar 2018 08:33,no,1
12 Mar 2018 08:32,pl,1
12 Mar 2018 08:21,br,1
12 Mar 2018 08:20,ru,1

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

webscrape and splitting of retrieved data into different lines - python

Related

Extract elements between two tags with Beautiful Soup and Python

Scrape information about iXBRL link inside source code with python Beautiful Soup

Returning table information based on a condition in Beautiful Soup/Python

Scraping pagination via "page=" midway in url

Web Scraping Ali Express Order Data

Categories

Resources