How to match data from linked pages together when web scraping - Python

I'm trying to scrape some data from a website, and I have an issue matching the data from every subpage to the data of the main page.
For example: the main page has a country name, "Alabama Trucking Companies", and when I follow its link I find some cities (Abbeville, Adamsville, ...etc). I need to pair every city's details (city name and city link) with its country name.
Country names that I scraped from the main page:
City names that I scraped from the subpage:
The code below extracts the data from the main and sub pages individually without matching them to each other, so how can I solve this issue, please?
The code that I've used:
from datetime import datetime

import requests
import pandas as pd
from bs4 import BeautifulSoup

start_time = datetime.now()

url = 'https://www.quicktransportsolutions.com/carrier/usa-trucking-companies.php'
page_country = requests.get(url).content
soup_country = BeautifulSoup(page_country, 'lxml')
countries = soup_country.find('div', {'class': 'col-xs-12 col-sm-9'})

# Collect every country title and link from the main page
countries_list = []
country_info = countries.find_all('div', {'class': 'col-md-4 column'})
for i in country_info:
    title_country = i.text.strip()
    href_country = i.find('a', href=True)['href']
    countries_list.append({'Country Title': title_country,
                           'Link': f'https://www.quicktransportsolutions.com//carrier//{href_country}'})

# Fetch every country page and keep the table that holds its cities
countries_links = []
for i in pd.DataFrame(countries_list)['Link']:
    page_city = requests.get(i).content
    soup_city = BeautifulSoup(page_city, 'lxml')
    city = soup_city.find('div', {'align': 'center', 'class': 'table-responsive'})
    countries_links.append(city)

# Extract every city title and link -- by now the country name is no longer available
cities_list = []
for i in countries_links:
    city_info = i.find_all('td', "")
    for i in city_info:
        title_city = i.text.strip()
        try:
            href_city = i.find('a', href=True)['href']
        except:
            continue
        cities_list.append({'City Title': title_city, 'City Link': href_city})

end_time = datetime.now()
print(f'Duration: {end_time - start_time}')

df = pd.DataFrame(cities_list)
df = df.loc[df['City Link'] != '#'].drop_duplicates().reset_index(drop=True)
df
The expected data for every country is shown below:

Instead of parsing all of the state links and adding them to a list prior to crawling each of the city pages, what you can do is parse each state, extract its link, then immediately follow that link to get all of the cities for that state before moving on to the next state, appending all of the information to one master list as you go.
For example:
start_time = datetime.now()

url = 'https://www.quicktransportsolutions.com/carrier/usa-trucking-companies.php'
page_country = requests.get(url).content
soup_country = BeautifulSoup(page_country, 'lxml')
countries = soup_country.find('div', {'class': 'col-xs-12 col-sm-9'})

data_list = []
country_info = countries.find_all('div', {'class': 'col-md-4 column'})
for i in country_info:
    title_country = i.text.strip()
    href_country = i.find('a', href=True)['href']
    link = f'https://www.quicktransportsolutions.com/carrier/{href_country}'

    # Follow the country link right away, while the country title is still in scope
    page_city = requests.get(link).content
    soup_city = BeautifulSoup(page_city, 'lxml')
    city = soup_city.find('div', {'align': 'center', 'class': 'table-responsive'})
    city_info = city.find_all('td', "")
    for i in city_info:
        title_city = i.text.strip()
        try:
            href_city = i.find('a', href=True)['href']
        except:
            continue

        # Each row carries both the country and the city that belongs to it
        row = {
            'Country Title': title_country,
            'Link': link,
            'City Title': title_city,
            'City Link': href_city
        }
        data_list.append(row)

end_time = datetime.now()
print(f'Duration: {end_time - start_time}')

df = pd.DataFrame(data_list)
df = df.loc[df['City Link'] != '#'].drop_duplicates().reset_index(drop=True)
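If it helps to sanity-check the merged result, one optional follow-up (not part of the original answer, just a quick check) is to count how many city rows each country ended up with in the combined frame:

# Assumes df from the snippet above; counts city rows per country
print(df.groupby('Country Title')['City Title'].count().head())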

Related

Trying to export data taken from HTML using beautiful soup

I'm trying to extract information from the HTML text that I get from URLs built inside a for loop, and then use Beautiful Soup.
I manage to isolate the information correctly, but when I try to export the data I get the error message "All arrays must be of the same length".
weblink = []
filing_type = []
company_name = []
date = []

#Importing file
df = pd.read_csv('Downloads\Dropped_Companies.csv')

#Getting companie's names into list
companies_column = list(df.columns.values)[4]
name_ = df[companies_column].tolist()

#Formatting company's names for creating URLs
for CompanyName in name_:
    company_name.append(CompanyName.lower().replace(" ", '_'))
company_name

for item in range(0, len(company_name)):
    link = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&company=' + company_name[item] + '&type=10-K&dateb=&owner=exclude&count=100'

    #Getting the HTML text
    headers = random.choice(headers_list)
    r = requests.Session()
    r.headers = headers
    html = r.get(link).text

    #Calling beautiful soup for better HTML text
    soup = bs.BeautifulSoup(html)
    tet_ = soup.find_all("a", id="documentsbutton")

    #Get the links
    for link in tet_:
        weblink.append('https://www.sec.gov' + link.get('href'))

    test11 = soup.find_all("table", class_="tableFile2")
    for link in test11:
        row = link.find_all("td", nowrap="nowrap")
        for i in range(0, len(row), 3):
            filing_type.append(row[i].getText())
        date.append(link.find("td", class_="small").find_next_sibling("td").text)
        name.append(company_name[item])

data = {'Company Name': name, 'Filing Date': date, 'Filing Type': filing_type, "Weblink": weblink}
outputdf = pd.DataFrame(data)
outputdf.to_csv('Downloads/t_10KLinks.csv')
data = {'Company Name':name,'Filing Date': date,'Filing Type':filing_type,"Weblink":weblink}
outputdf = pd.DataFrame.from_dict(data, orient='index')
outputdf.to_csv('Downloads/t_10KLinks.csv')
See the pandas.DataFrame.from_dict documentation.
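As a rough illustration (with made-up lists, not the scraped data) of why orient='index' sidesteps the length check: each list becomes a row and shorter rows are padded with NaN, so you can transpose afterwards if you want one column per field:

import pandas as pd

# Hypothetical lists of unequal length, standing in for the scraped fields
data = {'Company Name': ['a', 'b', 'c'], 'Filing Date': ['2020-01-01', '2020-02-01']}

outputdf = pd.DataFrame.from_dict(data, orient='index')  # rows padded with NaN
outputdf = outputdf.transpose()  # back to one column per field
outputdf.to_csv('t_10KLinks_example.csv')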

Multiple requests to Scrape different pages is giving the same result

I want to scrape the ICC cricket website and find the rankings of batsmen on a particular date. The problem is that the result I'm getting is the same for all dates: the scraping gives me the most recent rankings rather than the rankings on the chosen date. The code is given below. Can someone tell me why this is happening, or suggest a solution?
I suspect the problem is that Beautiful Soup is not waiting for the page to load completely, which in turn gives the wrong information, since the data I need only appears after the date filter is applied on the website.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import datetime
import os

date_list = pd.date_range(start="1971-02-01", end=datetime.date.today(), freq='1d')

def get_batsmen(date):
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting?at={date}'
    page = requests.get(url).text
    doc = BeautifulSoup(page, "html.parser")
    find_class = doc.find_all("td", class_='table-body__cell rankings-table__name name')
    player_list = []
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    player_list.append(find_top.text)
    for item in find_class:
        player_name = item.find("a")
        # print(player_name.text)
        player_list.append(player_name.text)
    df = pd.DataFrame(player_list, columns=['Player Name'])
    return df

def get_bowler(date):
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling?at={date}'
    page = requests.get(url).text
    doc = BeautifulSoup(page, "html.parser")
    find_class = doc.find_all("td", class_='table-body__cell rankings-table__name name')
    player_list = []
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    player_list.append(find_top.text)
    for item in find_class:
        player_name = item.find("a")
        # print(player_name.text)
        player_list.append(player_name.text)
    df = pd.DataFrame(player_list, columns=['Player Name'])
    return df

def get_allrounder(date):
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/all-rounder?at={date}'
    page = requests.get(url).text
    doc = BeautifulSoup(page, "html.parser")
    find_class = doc.find_all("td", class_='table-body__cell rankings-table__name name')
    player_list = []
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    player_list.append(find_top.text)
    for item in find_class:
        player_name = item.find("a")
        # print(player_name.text)
        player_list.append(player_name.text)
    df = pd.DataFrame(player_list, columns=['Player Name'])
    return df

#Storing the data into multiple csvs
for date in date_list:
    year = date.year
    month = date.month
    day = date.day
    newpath = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}'
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    newpath1 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}'
    if not os.path.exists(newpath1):
        os.makedirs(newpath1)
    newpath2 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}\{day}'
    if not os.path.exists(newpath2):
        os.makedirs(newpath2)
    get_batsmen(date).to_csv(newpath2+'/batsmen.csv')
    get_bowler(date).to_csv(newpath2+'/bowler.csv')
    get_allrounder(date).to_csv(newpath2+'/allrounder.csv')
When the website you're scraping is interactive, it can be worth looking at Selenium as the scraping package instead of bs4, so that the page's JavaScript is actually executed.
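A minimal sketch of that idea, assuming chromedriver is installed and reusing the URL pattern and class names from the question (not verified against the live site); depending on how the page loads, an explicit wait (Selenium's WebDriverWait) may also be needed before reading page_source:

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_rendered_batsmen(date):
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting?at={date}'
    options = Options()
    options.add_argument('--headless')  # run without opening a browser window
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)            # the browser executes the page's JavaScript
        html = driver.page_source  # HTML after rendering, not the raw response
    finally:
        driver.quit()
    doc = BeautifulSoup(html, 'html.parser')
    cells = doc.find_all('td', class_='table-body__cell rankings-table__name name')
    return [cell.find('a').text for cell in cells if cell.find('a')]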

I'm only scraping the first element of each page using BeautifulSoup; my goal is to scrape all elements within each page. What am I doing wrong?

I'm trying to scrape the public contact info of all the persons on each page of the website, so I built three functions: one to modify the URL, one to extract the source code from it using BeautifulSoup, and one to transform it and finally get the name, title, email, personal website and bio. But for some reason I'm only getting back the first person of each page; it does cover the total number of pages, but it only scrapes the first person on each one.
Here's how I wrote the code for that part:
def paginas(pages):
    pag = f'https://www.hsph.harvard.edu/profiles/page/{pages}/'
    return pag

def extract(pag):
    url = requests.get(pag).text
    soup = BeautifulSoup(url, 'lxml')
    return soup

def transform(soup):
    #principal = soup.find('div', class_ = 'hsph-bootstrap')
    items = soup.find_all('div', class_ = 'grid-card grid-card-hover position-relative border rounded px-4 py-5')
    for item in items:
        try:
            #name = item.find('a').text.strip() this is another way of getting it in this website
            name = item.find('h2', class_ = 'h3 mb-0').text.strip() # always present
        except:
            name = 'not given'
        #Contact data.
        website = item.find('h2', class_ = 'h3 mb-0').a['href']
        main = item.find('div', class_ = 'grid-card-content')
        bio = main.find('div', class_ = 'faculty-bio small').text.strip()
        university = 'Harvard School of Public Health'
        #INSIDE THE LINK
        wd = webdriver.Chrome(options=options)
        wd.get(website)
        insideurl = requests.get(website).text
        insidesoup = BeautifulSoup(insideurl, 'lxml')
        #BIO DATA
        insitem = insidesoup.find('div', class_ ='row rounded bg-white p-5')
        try:
            email = insitem.find('p', class_ = 'faculty-contact mb-2').text.strip()
        except:
            email = ''
        try:
            ti = insitem.find('div', class_ = 'faculty-bio')
            title = ti.find('p').text
        except:
            ti = ''
            title = ''
        #EXTRA DATA ON BIO.
        try:
            bio2 = insidesoup.find('div', class_ = 'faculty-profile-container container mb-5')
            complete = bio2.find('div', class_ = 'faculty-profile-overview-section').text.strip()
        except:
            bio2 = ''
            complete = ''
        contact = {
            'name' : name,
            'title' : title,
            'university' : university,
            'email' : email,
            'website' : website,
            'bio' : complete,
            'area' : bio,
        }
        leadlist.append(lead)
    return

leadlist = []
for pages in range(1, 127, 1):
    c = paginas(pages)
    b = extract(c)
    d = transform(b)
    print(len(leadlist))

Python 3 - Pandas, DataFrame, ValueError: cannot set a row with mismatched columns

I see others are having issues with this as well, but I haven't found a working solution in other posts yet, maybe because I'm not implementing it properly. I'm making a scraper for a job posting website, and this is the section of code I'm having trouble with:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time

# URL of specific job search
# FUTURE TO DO - Break it up into editable variables based on URL structure
URL = 'https://www.indeed.ca/jobs?q=data+scientist,+data+analyst,+python&l=Canada&jt=fulltime'

# conducting a request of the stated URL above:
page = requests.get(URL)

# specifying a desired format of “page” using the html parser - this allows python to read the various components of the page, rather than treating it as one long string.
soup = BeautifulSoup(page.text, 'html.parser')

# printing soup in a more structured tree format that makes for easier reading
# print(soup.prettify())

# Extract job title
def extract_job_title_from_result(soup):
    jobs = []
    for div in soup.findAll(name='div', attrs={'class':'row'}):
        for a in div.findAll(name='a', attrs={'data-tn-element':'jobTitle'}):
            jobs.append(a['title'])
    return(jobs)

extract_job_title_from_result(soup)

# Extract company
def extract_company_from_result(soup):
    companies = []
    for div in soup.findAll(name='div', attrs={'class':'row'}):
        company = div.findAll(name='span', attrs={'class':'company'})
        if len(company) > 0:
            for b in company:
                companies.append(b.text.strip())
        else:
            sec_try = div.findAll(name='span', attrs={'class':'result-link-source'})
            for span in sec_try:
                companies.append(span.text.strip())
    return(companies)

extract_company_from_result(soup)

# Extract location
def extract_location_from_result(soup):
    locations = []
    spans = soup.findAll('span', attrs={'class': 'location'})
    for span in spans:
        locations.append(span.text)
    return(locations)

extract_location_from_result(soup)

# Extract salary
def extract_salary_from_result(soup):
    salaries = []
    for div in soup.findAll(name='div', attrs={'class':'row'}):
        try:
            salaries.append(div.find('nobr').text)
        except:
            try:
                div_two = div.find(name='div', attrs={'class':'sjcl'})
                div_three = div_two.find('div')
                salaries.append(div_three.text.strip())
            except:
                salaries.append('Nothing_found')
    return(salaries)

extract_salary_from_result(soup)

# Extract job summary
# FUTURE TO DO - Extract full job description by each job page posting
# ie. going through the link
def extract_summary_from_result(soup):
    summaries = []
    spans = soup.findAll('span', attrs={'class': 'summary'})
    for span in spans:
        summaries.append(span.text.strip())
    return(summaries)

extract_summary_from_result(soup)

# Max results per city, which cities, and an output for the data
max_results_per_city = 10
city_set = ['Canada'] # 'New+York','Chicago','San+Francisco', 'Austin', 'Seattle', 'Los+Angeles', 'Philadelphia', 'Atlanta', 'Dallas', 'Pittsburgh', 'Portland', 'Phoenix', 'Denver', 'Houston', 'Miami', 'Washington+DC', 'Boulder']
columns = ['city', 'job_title', 'company_name', 'location', 'summary', 'salary']
sample_df = pd.DataFrame(columns=columns)

#scraping code:
for city in city_set:
    for start in range(0, max_results_per_city, 10):
        page = requests.get('http://www.indeed.ca/jobs?q=data+scientist,+data+analyst,+python&l=' + str(city) + '&jt=fulltime') # + '&start=' + str(start))
        time.sleep(1)  #ensuring at least 1 second between page grabs
        soup = BeautifulSoup(page.text, 'lxml')
        for div in soup.find_all(name='div', attrs={'class':'row'}):
            #creating an empty list to hold the data for each posting
            job_post = []
            #append city name
            job_post.append(city)
            #grabbing job title
            for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
                job_post.append(a['title'])
            #grabbing company name
            company = div.find_all(name='span', attrs={'class':'company'})
            if len(company) > 0:
                [job_post.append(b.text.strip()) for b in company]
            else:
                [job_post.append(span.text) for span in div.find_all(name='span', attrs={'class':'result-link-source'})]
            #grabbing location name
            [job_post.append(span.text) for span in div.findAll('span', attrs={'class': 'location'})]
            #grabbing summary text
            [job_post.append(span.text.strip()) for span in div.findAll('span', attrs={'class': 'summary'})]
            #grabbing salary
            div_two = div.find(name='div', attrs={'class':'salarySnippet'})
            job_post.append(div_two.text.strip() if div_two else 'Nothing found')
            #appending list of job post info to dataframe at index num
            sample_df.loc[len(sample_df) + 1] = job_post

#saving sample_df as a local csv file — define your own local path to save contents
sample_df.to_csv('[filepath].csv', encoding='utf-8')
I seem to be having issues with the second line, or the very last line. Receiving the error:
Traceback (most recent call last):
File "script.py", line 128, in <module>
sample_df.loc[len(sample_df) + 1] = job_post
File "C:\Users\...Python\Python36\lib\site-packages\pandas\core\indexing.py", line 194, in __setitem__
self._setitem_with_indexer(indexer, value)
File "C:\Users\...\Python\Python36\lib\site-packages\pandas\core\indexing.py", line 439, in _setitem_with_indexer
raise ValueError("cannot set a row with "
ValueError: cannot set a row with mismatched columns
I saw a few suggested solutions using .append instead, but then I receive an error that Pandas doesn't use .append, or something along those lines. Any suggestions?
This is older code from last year that I'm working from:
https://medium.com/@msalmon00/web-scraping-job-postings-from-indeed-96bd588dcb4b
Thanks in advance!
I cannot reproduce the error. I added the dependencies (pandas and time), made some assumptions about the unspecified objects (city_set and max_results_per_city), and I get a DataFrame with all the entries. I changed some of your code for the salary, as the structure of the website seems to have changed. I did not run any extended tests, though.
import pandas as pd
import time

columns = ['city', 'job_title', 'company_name', 'location', 'summary', 'salary']
sample_df = pd.DataFrame(columns=columns)

city_set = ('Toronto, ON', 'Calgary, AB', 'Montréal, QC')
max_results_per_city = 30

#scraping code:
for city in city_set:
    for start in range(0, max_results_per_city, 10):
        page = requests.get('http://www.indeed.ca/jobs?q=data+scientist,+data+analyst,+python&l=' + str(city) + '&jt=fulltime') # + '&start=' + str(start))
        time.sleep(1)  #ensuring at least 1 second between page grabs
        soup = BeautifulSoup(page.text, 'lxml')
        for div in soup.find_all(name='div', attrs={'class':'row'}):
            #creating an empty list to hold the data for each posting
            job_post = []
            #append city name
            job_post.append(city)
            #grabbing job title
            for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
                job_post.append(a['title'])
            #grabbing company name
            company = div.find_all(name='span', attrs={'class':'company'})
            if len(company) > 0:
                [job_post.append(b.text.strip()) for b in company]
            else:
                [job_post.append(span.text) for span in div.find_all(name='span', attrs={'class':'result-link-source'})]
            #grabbing location name
            [job_post.append(span.text) for span in div.findAll('span', attrs={'class': 'location'})]
            #grabbing summary text
            [job_post.append(span.text.strip()) for span in div.findAll('span', attrs={'class': 'summary'})]
            #grabbing salary
            div_two = div.find(name='div', attrs={'class':'salarySnippet'})
            job_post.append(div_two.text.strip() if div_two else 'Nothing found')
            #appending list of job post info to dataframe at index num
            sample_df.loc[len(sample_df) + 1] = job_post
Well, I didn't get the main solution to work, but I did a workaround just using csv writerow, and it works just fine. I'll play with the DataFrame approach later. Thanks everyone!
#scraping code:
with open('output.csv', 'a', newline='') as f_output:
    csv_output = csv.writer(f_output) #delimiter=",")
    for city in city_set:
        for start in range(0, max_results_per_city, 10):
            page = requests.get('http://www.indeed.ca/jobs?q=data+scientist,+data+analyst,+python&l=' + str(city) + '&jt=fulltime') # + '&start=' + str(start))
            time.sleep(1)  #ensuring at least 1 second between page grabs
            soup = BeautifulSoup(page.text, 'lxml')
            for div in soup.find_all(name='div', attrs={'class':'row'}):
                #creating an empty list to hold the data for each posting
                job_post = []
                #append city name
                job_post.append(city)
                #grabbing job title
                for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
                    job_post.append(a['title'])
                #grabbing company name
                company = div.find_all(name='span', attrs={'class':'company'})
                if len(company) > 0:
                    [job_post.append(b.text.strip()) for b in company]
                else:
                    [job_post.append(span.text) for span in div.find_all(name='span', attrs={'class':'result-link-source'})]
                #grabbing location name
                [job_post.append(span.text) for span in div.findAll('span', attrs={'class': 'location'})]
                #grabbing summary text
                [job_post.append(span.text.strip()) for span in div.findAll('span', attrs={'class': 'summary'})]
                #grabbing salary
                div_two = div.find(name='div', attrs={'class':'salarySnippet'})
                job_post.append(div_two.text.strip() if div_two else 'Nothing found')
                #appending list of job post info to dataframe at index num
                #sample_df.loc[len(sample_df) + 1] = job_post
                #saving sample_df as a local csv file — define your own local path to save contents
                csv_output.writerow([job_post])
                #sample_df.to_csv('[filepath].csv', encoding='utf-8')
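Note that csv_output.writerow([job_post]) writes the entire job_post list into a single cell; if each field should get its own column in the CSV, pass the list itself:

csv_output.writerow(job_post)  # one column per field instead of one cell holding the whole list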

How to add scraped data to a csv file?

I am new to web scraping. I am scraping data from a website where I scrape the hrefs from the first page, then go to each href and find the p tags in the class 'address-data'. I want to store the p-tag data of one URL in one row and the p-tag data of the second URL in a second row. My data is appended to 'myUrl'. I want to save the data in a csv file, e.g. address, longitude, latitude, phone, email, and then a new line starts.
here is my code:
from bs4 import BeautifulSoup
import requests
import csv
myUrl = []
urls = ["http://www.shaditayari.pk/s&category=326&location=266&a=true&paged{}".format(i) for i in range(1, 10)] # make a url list and iterate over it
for url in urls:
    r = requests.get(url)
    print('idr1')
    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all('a', {'main-link'}):
        iurl = link.get('href')
        r = requests.get(iurl)
        print(iurl)
        soup = BeautifulSoup(r.content, "lxml")
        with open('lhr.cv', 'wb') as file:
            divs = soup.find_all('div', attrs={"class": "address-data"})
            for div in divs:
                myUrl.append(div.find('p').text)
#print(myUrl)

with open('lhr.cv', 'w') as file:
    writer = csv.writer(file)
    for row in myUrl:
        writer.writerow(row)
expected output:
9 Fane Road، Lahore 54000, Pakistan|1.561381309140028|74.31484723624567|042-37363901-9|gm@bestwesternlahore.com/sales@bestwesternlahore.com/reservations@bestwesternlahore.com
1/E-3, Main Boulevard Gulberg III, Lahore|31.525700029363|74.34930089283|0305-2960614|https://www.facebook.com/pages/Zauk-Banquet-Hall/204612846290857
I've written this in Python 2 and used XPaths (because I think they're cleaner and simpler to use for web scraping), but this code will get you your list of links:
#Load required libraries
import requests
from lxml import html
import pandas as pd

#Create base URL
url = "http://www.shaditayari.pk/?s&post_type=ait-item&a=true&paged="

#First, we want to work out the number of pages to scrape. We load any page and get the largest page number
page = requests.get(url+str(1))
tree = html.fromstring(page.content)
no_pages = tree.xpath("//nav/a[last()]/text()")[0] #This comes out as a list of two - we only want the first one

#Next, we want to scrape the links to each page with the address
links = []
names = []
for i in range(1, int(no_pages)+1):
    page = requests.get(url+str(i))
    tree = html.fromstring(page.content)
    page_links = tree.xpath("//div[@class = 'item-title']/a/@href")
    page_names = tree.xpath("//a/h3/text()")
    links = links + page_links
    names = names + page_names
    print i

address_links = {"Name": names,
                 "URL": links}
pd.DataFrame(address_links).to_csv(u"address_links.csv")
This code needs completing (the remaining appends, the rest of the dictionary, and a line to create the CSV), but it will get you your details:
address_list = []
latitude_list = []
longitude_list = []
telephone_list = []
email_list = []
webpage_list = []
counter = 0

for url in address_links["URL"]:
    page = requests.get(url)
    tree = html.fromstring(page.content)

    address = tree.xpath("//div[@itemprop = 'streetAddress']/p/text()")
    if len(address) == 0:
        address = ""
    else:
        address = address[0]

    latitude = tree.xpath("//p/meta[@itemprop = 'latitude']/@content")
    if len(latitude) == 0:
        latitude = ""
    else:
        latitude = latitude[0]

    longitude = tree.xpath("//p/meta[@itemprop = 'longitude']/@content")
    if len(longitude) == 0:
        longitude = ""
    else:
        longitude = longitude[0]

    telephone = tree.xpath("//a[@class = 'phone']/text()")
    if len(telephone) == 0:
        telephone = ""
    else:
        telephone = telephone[0]

    email = tree.xpath("//a[@itemprop = 'email']/text()")
    if len(email) == 0:
        email = ""
    else:
        email = email[0]

    webpage = tree.xpath("//a[@itemprop = 'url']/@href")
    if len(webpage) == 0:
        webpage = ""
    else:
        webpage = webpage[0]

    address_list.append(address)
    #continue for others

    counter += 1
    print counter

address_details = {"Name": names,
                   "URL": links,
                   "Address": address_list,
                   #continue for others
                   }
You might need to add in some unicode encoding before you turn it into a CSV; that's covered in other answers.
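For the final CSV step the answer leaves open, one possible completion (just a sketch, assuming every per-field list is appended inside the loop the same way as address_list, so all the lists end up the same length) is to let pandas handle the encoding when writing:

# Hypothetical completion of the dictionary and the CSV line
address_details = {"Name": names,
                   "URL": links,
                   "Address": address_list,
                   "Latitude": latitude_list,
                   "Longitude": longitude_list,
                   "Telephone": telephone_list,
                   "Email": email_list,
                   "Webpage": webpage_list}

pd.DataFrame(address_details).to_csv(u"address_details.csv", encoding="utf-8")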
