How to add scraped data to a CSV file? - Python

I am new to web scraping. I scrape the hrefs from the first page of a website, then follow each href and find the 'p' tags inside the class 'address-data'. I want to store the 'p' tag data from the first URL in one row and the data from the second URL in the second row. My data is appended to 'myUrl'. I want to save the data to a CSV file, e.g. address, latitude, longitude, phone, email, and then a new line starts.
here is my code:
from bs4 import BeautifulSoup
import requests
import csv

myUrl = []
urls = ["http://www.shaditayari.pk/s&category=326&location=266&a=true&paged{}".format(i) for i in range(1, 10)]  # make a url list and iterate over it
for url in urls:
    r = requests.get(url)
    print('idr1')
    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all('a', {'main-link'}):
        iurl = link.get('href')
        r = requests.get(iurl)
        print(iurl)
        soup = BeautifulSoup(r.content, "lxml")
        with open('lhr.cv', 'wb') as file:
            divs = soup.find_all('div', attrs={"class": "address-data"})
            for div in divs:
                myUrl.append(div.find('p').text)
                # print(myUrl)

with open('lhr.cv', 'w') as file:
    writer = csv.writer(file)
    for row in myUrl:
        writer.writerow(row)
expected output:
9 Fane Road، Lahore 54000, Pakistan|1.561381309140028|74.31484723624567|042-37363901-9|gm@bestwesternlahore.com/sales@bestwesternlahore.com/reservations@bestwesternlahore.com
1/E-3, Main Boulevard Gulberg III, Lahore|31.525700029363|74.34930089283|0305-2960614|https://www.facebook.com/pages/Zauk-Banquet-Hall/204612846290857

I've written this in Python 2 using XPaths (because I think they're cleaner and simpler to use for web scraping), but this code will get you your list of links:
# Load required libraries
import requests
from lxml import html
import pandas as pd

# Create base URL
url = "http://www.shaditayari.pk/?s&post_type=ait-item&a=true&paged="

# First, we want to work out the number of pages to scrape. We load any page and get the largest page number
page = requests.get(url + str(1))
tree = html.fromstring(page.content)
no_pages = tree.xpath("//nav/a[last()]/text()")[0]  # This comes out as a list of two - we only want the first one

# Next, we want to scrape the links to each page with the address
links = []
names = []
for i in range(1, int(no_pages) + 1):
    page = requests.get(url + str(i))
    tree = html.fromstring(page.content)
    page_links = tree.xpath("//div[@class = 'item-title']/a/@href")
    page_names = tree.xpath("//a/h3/text()")
    links = links + page_links
    names = names + page_names
    print i

address_links = {"Name": names,
                 "URL": links}
pd.DataFrame(address_links).to_csv(u"address_links.csv")
This code needs completing, with the remaining appends, the rest of the dictionary, and a line to create the CSV, but it will get your details:
address_list = []
latitude_list = []
longitude_list = []
telephone_list = []
email_list = []
webpage_list = []
counter = 0

for url in address_links["URL"]:
    page = requests.get(url)
    tree = html.fromstring(page.content)
    address = tree.xpath("//div[@itemprop = 'streetAddress']/p/text()")
    if len(address) == 0:
        address = ""
    else:
        address = address[0]
    latitude = tree.xpath("//p/meta[@itemprop = 'latitude']/@content")
    if len(latitude) == 0:
        latitude = ""
    else:
        latitude = latitude[0]
    longitude = tree.xpath("//p/meta[@itemprop = 'longitude']/@content")
    if len(longitude) == 0:
        longitude = ""
    else:
        longitude = longitude[0]
    telephone = tree.xpath("//a[@class = 'phone']/text()")
    if len(telephone) == 0:
        telephone = ""
    else:
        telephone = telephone[0]
    email = tree.xpath("//a[@itemprop = 'email']/text()")
    if len(email) == 0:
        email = ""
    else:
        email = email[0]
    webpage = tree.xpath("//a[@itemprop = 'url']/@href")
    if len(webpage) == 0:
        webpage = ""
    else:
        webpage = webpage[0]
    address_list.append(address)
    # continue for others
    counter += 1
    print counter

address_details = {"Name": names,
                   "URL": links,
                   "Address": address_list,
                   # continue for others
                   }
You might need to add some Unicode encoding before you turn it into a CSV. That's answered here.
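As a rough completion sketch only (not part of the original answer): assuming the per-field appends inside the loop mirror address_list, the final dictionary and CSV step could look like this, with an illustrative output filename and encoding="utf-8" to cover the Unicode point above:
# inside the for-loop, mirror address_list for every other field:
#     latitude_list.append(latitude)
#     longitude_list.append(longitude)
#     telephone_list.append(telephone)
#     email_list.append(email)
#     webpage_list.append(webpage)

# after the loop, collect everything into one dictionary
address_details = {"Name": names,
                   "URL": links,
                   "Address": address_list,
                   "Latitude": latitude_list,
                   "Longitude": longitude_list,
                   "Telephone": telephone_list,
                   "Email": email_list,
                   "Webpage": webpage_list}

# index=False drops the row index; encoding handles the Unicode issue mentioned above
pd.DataFrame(address_details).to_csv("address_details.csv", index=False, encoding="utf-8")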

Related

how to match data from linked pages together by web scraping

I'm trying to scrape some data from a website and I have an issue matching the data from every subpage to the data of the main page.
For example: the main page has a country name, "Alabama Trucking Companies", and when I follow its link I find some cities (Abbeville, Adamsville, ...etc). I need to associate every city's details (city name and city link) with its country name.
Country names that I scraped from the main page:
City names that I scraped from the subpage:
The code below extracts the data from the main and sub pages individually without matching them to each other, so how can I solve this issue please?
The code that I've used:
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

start_time = datetime.now()

url = 'https://www.quicktransportsolutions.com/carrier/usa-trucking-companies.php'
page_country = requests.get(url).content
soup_country = BeautifulSoup(page_country, 'lxml')
countries = soup_country.find('div', {'class': 'col-xs-12 col-sm-9'})

countries_list = []
country_info = countries.find_all('div', {'class': 'col-md-4 column'})
for i in country_info:
    title_country = i.text.strip()
    href_country = i.find('a', href=True)['href']
    countries_list.append({'Country Title': title_country,
                           'Link': f'https://www.quicktransportsolutions.com//carrier//{href_country}'})

countries_links = []
for i in pd.DataFrame(countries_list)['Link']:
    page_city = requests.get(i).content
    soup_city = BeautifulSoup(page_city, 'lxml')
    city = soup_city.find('div', {'align': 'center', 'class': 'table-responsive'})
    countries_links.append(city)

cities_list = []
for i in countries_links:
    city_info = i.find_all('td', "")
    for i in city_info:
        title_city = i.text.strip()
        try:
            href_city = i.find('a', href=True)['href']
        except:
            continue
        cities_list.append({'City Title': title_city, 'City Link': href_city})

end_time = datetime.now()
print(f'Duration: {end_time - start_time}')

df = pd.DataFrame(cities_list)
df = df.loc[df['City Link'] != '#'].drop_duplicates().reset_index(drop=True)
df
The expected data for every country is as below:
Instead of parsing all of the state links and adding them to a list prior to crawling each of the city pages, what you can do is parse each state, extract its link, then immediately follow that link to get all of the cities for that state before moving on to the next state, and append all the information to one master list as you go.
For example:
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

start_time = datetime.now()

url = 'https://www.quicktransportsolutions.com/carrier/usa-trucking-companies.php'
page_country = requests.get(url).content
soup_country = BeautifulSoup(page_country, 'lxml')
countries = soup_country.find('div', {'class': 'col-xs-12 col-sm-9'})

data_list = []
country_info = countries.find_all('div', {'class': 'col-md-4 column'})
for i in country_info:
    title_country = i.text.strip()
    href_country = i.find('a', href=True)['href']
    link = f'https://www.quicktransportsolutions.com/carrier/{href_country}'

    page_city = requests.get(link).content
    soup_city = BeautifulSoup(page_city, 'lxml')
    city = soup_city.find('div', {'align': 'center', 'class': 'table-responsive'})
    city_info = city.find_all('td', "")
    for i in city_info:
        title_city = i.text.strip()
        try:
            href_city = i.find('a', href=True)['href']
        except:
            continue
        row = {
            'Country Title': title_country,
            'Link': link,
            'City Title': title_city,
            'City Link': href_city
        }
        data_list.append(row)

end_time = datetime.now()
print(f'Duration: {end_time - start_time}')

df = pd.DataFrame(data_list)
df = df.loc[df['City Link'] != '#'].drop_duplicates().reset_index(drop=True)
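If you also want to save the matched rows like in the first question, a single to_csv call on the finished frame is enough (the filename here is just an example):
df.to_csv('trucking_cities.csv', index=False)  # write the matched country/city rows to disk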

Create a specific Web Scraper

I am making the effort to learn to scrape in Python, and in this case my idea is to make a tool that obtains data from a web page. I am having trouble writing the "for" loop that goes through the page and collects the data of each box (item), namely:
IDoffer
List
Title
Location
content
phone
It is not homework, it is my own initiative, but I am not making progress, so I thank you for your help.
Here is the code I have:
from bs4 import BeautifulSoup
import requests

URL_BASE = "https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina="
MAX_PAGES = 2
counter = 0

for i in range(0, MAX_PAGES):
    # Building the URL
    if i > 0:
        url = "%s%d" % (URL_BASE, i)
    else:
        url = URL_BASE

    # We make the request to the web
    req = requests.get(url)

    # We check that the request returns a Status Code = 200
    statusCode = req.status_code
    if statusCode == 200:
        # We pass the HTML content of the web to a BeautifulSoup() object
        html = BeautifulSoup(req.text, "html.parser")

        # We get all the divs where the inputs are
        entradas_IDoffer = html.find_all('div', {'class': 'aditem-header'})

        # We go through all the inputs and extract info
        for entrada1 in entradas_IDoffer:
            # THESE ARE SOME ATTEMPTS
            # Title = entrada.find('div', {'class': 'aditem-detail-title'}).getText()
            # location = entrada.find('div', {'class': 'list-location-region'}).getText()
            # content = entrada.find('div', {'class': 'tx'}).getText()
            # phone = entrada.find('div', {'class': 'telefonos'}).getText()

            # Offer Title
            entradas_Title = html.find_all('div', {'class': 'aditem-detail'})
            for entrada2 in entradas_Title:
                counter += 1
                Title = entrada2.find('a', {'class': 'aditem-detail-title'}).getText()
            counter += 1
            IDoffer = entrada1.find('div', {'class': 'x5'}).getText()

            # Location
            # entradas_location = html.find_all('div', {'class': 'aditem-detail'})
            # for entrada4 in entradas_location:
            #     counter += 1
            #     location = entrada4.find('div', {'class': 'list-location-region'}).getText()

            # Offer content
            # entradas_content = html.find_all('div', {'class': 'aditem-detail'})
            # for entrada3 in entradas_content:
            #     counter += 1
            #     content = entrada3.find('div', {'class': 'tx'}).getText()

            print("%d - %s \n%s\n%s" % (counter, IDoffer.strip(), url, Title))
    else:
        try:
            r = requests.head(req)
            print(r.status_code)
        except requests.ConnectionError:
            print("failed to connect")
        break
        # If the page no longer exists and it gives me a 400
Correct entradas_IDoffer:
entradas_IDoffer = html.find_all("div", class_="aditem CardTestABClass")
Title is located under an "a" tag, not a "div":
title = entrada.find("a", class_="aditem-detail-title").text.strip()
location = entrada.find("div", class_="list-location-region").text.strip()
content = entrada.find("div", class_="tx").text.strip()
Do the same for the other data.
They might be loading the phone number with JavaScript, so you may not be able to get it with bs4; you can get it using Selenium, roughly as sketched below.
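A minimal Selenium sketch (an illustration only, not tested against the live site: it assumes the number appears once the page is rendered and reuses the 'telefonos' class from the question, which may no longer match):
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()  # requires a matching chromedriver on PATH
driver.get('https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina=1')

# parse the rendered HTML (after JavaScript has run) with BeautifulSoup as usual
soup = BeautifulSoup(driver.page_source, 'html.parser')
phones = [div.text.strip() for div in soup.find_all('div', class_='telefonos')]
print(phones)

driver.quit()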
You wrote very lengthy code to loop through multiple pages; just do this to go through pages 1 and 2 using range, putting the URL in a formatted string.
for page in range(1, 3):
    url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
Full code:
import requests
from bs4 import BeautifulSoup

for page in range(1, 5):
    url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    entradas_IDoffer = soup.find_all("div", class_="aditem CardTestABClass")
    for entrada in entradas_IDoffer:
        title = entrada.find("a", class_="aditem-detail-title").text.strip()
        ID = entrada.find("div", class_="x5").text.strip()
        location = entrada.find("div", class_="list-location-region").text.strip()
        content = entrada.find("div", class_="tx").text.strip()
        print(title, ID, location, content)

How to scrape data from multiple tags with the same tag name and attributes in Python?

I want to extract the data from this website:
https://forecast.weather.gov/MapClick.php?lat=35.0868&lon=-90.0568
This image shows the info I want to extract, but I couldn't do it, as I couldn't find a way to extract data from tags with the same name under the same tree...
I have successfully extracted some data before, but I couldn't fetch this part. Here is my code:
import requests
from bs4 import BeautifulSoup

def weatherFetch(latitude, longitude):
    URL = 'https://forecast.weather.gov/MapClick.php?'
    URL = URL + 'lat=' + str(latitude) + '&lon=' + str(longitude)
    print(URL)
    dictionary = {
        'latitude': str(latitude), 'longitude': str(longitude),
        'cityName': '', 'weatherCondition': '', 'temprature': ''
    }
    res = requests.get(URL)
    if res.status_code == 200:  # we have used legit coordinates
        soup = BeautifulSoup(res.text, 'html.parser')
        arr = soup.findAll('div', {'class': 'panel panel-default'})
        if arr:
            try:
                cityName = arr[0].find("h2", "panel-title").text
                weatherCondition = arr[0].find("p", "myforecast-current").text
                temprature = arr[0].find("p", "myforecast-current-lrg").text
                windSpeed = arr[0].find_next("td", "text-right")  # this is the line where I am supposed to fetch wind speed
                print(windSpeed)
                dictionary['cityName'] = cityName
                dictionary['weatherCondition'] = weatherCondition
                dictionary['temprature'] = temprature
            except:
                return dictionary
Find the element with id current_conditions_detail.
Then find all the tr tags inside that table.
For each tr tag, find the td tags; there will be 2 such tags.
The first one is the title and the second one is the value. For example:
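A minimal sketch of that approach (placed inside the question's weatherFetch function, reusing its soup object; the 'Wind Speed' label is assumed to match the table text):
# locate the current-conditions details table by its id
details = soup.find(id='current_conditions_detail')

conditions = {}
for tr in details.find_all('tr'):
    tds = tr.find_all('td')
    if len(tds) == 2:  # first td is the label, second is the value
        conditions[tds[0].text.strip()] = tds[1].text.strip()

windSpeed = conditions.get('Wind Speed', '')
print(windSpeed)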
You could just use pandas to get the table, then filter out the stuff you want using .loc.
Not sure what the rest of your code is trying to do. You're creating a dictionary, but you only return it if there's an exception??
import requests
from bs4 import BeautifulSoup
import pandas as pd

def weatherFetch(latitude, longitude):
    URL = 'https://forecast.weather.gov/MapClick.php?'
    URL = URL + 'lat=' + str(latitude) + '&lon=' + str(longitude)
    print(URL)
    dictionary = {
        'latitude': str(latitude), 'longitude': str(longitude),
        'cityName': '', 'weatherCondition': '', 'temprature': ''
    }
    res = requests.get(URL)
    if res.status_code == 200:  # we have used legit coordinates
        soup = BeautifulSoup(res.text, 'html.parser')
        arr = soup.findAll('div', {'class': 'panel panel-default'})
        if arr:
            try:
                cityName = arr[0].find("h2", "panel-title").text
                weatherCondition = arr[0].find("p", "myforecast-current").text
                temprature = arr[0].find("p", "myforecast-current-lrg").text
                df = pd.read_html(str(arr[0]))[0]
                windSpeed = df.loc[df[0] == 'Wind Speed', 1][1]
                print(windSpeed)
                dictionary['cityName'] = cityName
                dictionary['weatherCondition'] = weatherCondition
                dictionary['temprature'] = temprature
            except:
                return dictionary

latitude, longitude = 35.0868, -90.0568
weatherFetch(latitude, longitude)
Output:
https://forecast.weather.gov/MapClick.php?lat=35.0868&lon=-90.0568
SW 5 mph

How to scrape embedded integers on a website

I'm trying to scrape the number of likes for the datasets available on this website.
I've been unable to work out a way of reliably identifying and scraping the relationship between the dataset title and the like count,
as it is embedded in the HTML as below:
I have previously used a scraper to get information about the resource URLs. In that case I was able to capture the last child a of a parent h3 whose parent has the class .dataset-item.
I would like to adapt my existing code to scrape the number of likes for each resource in the catalogue, rather than the URLs. Below is the code for the URL scraper I used:
from bs4 import BeautifulSoup as bs
import requests
import csv
from urllib.parse import urlparse

json_api_links = []
data_sets = []

def get_links(s, url, css_selector):
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    base = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    links = [base + item['href'] if item['href'][0] == '/' else item['href'] for item in soup.select(css_selector)]
    return links

results = []
# debug = []

with requests.Session() as s:
    for page in range(1, 2):  # set number of pages
        links = get_links(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-item h3 a:last-child')
        for link in links:
            data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
            json_api_links.append(data)
            # debug.append((link, data))

    resources = list(set([item.replace('opendata', '') for sublist in json_api_links for item in sublist]))  # can just leave as set

    for link in resources:
        try:
            r = s.get(link).json()  # entire package info
            data_sets.append(r)
            title = r['result']['title']  # certain items
            if 'resources' in r['result']:
                urls = ' , '.join([item['url'] for item in r['result']['resources']])
            else:
                urls = 'N/A'
        except:
            title = 'N/A'
            urls = 'N/A'
        results.append((title, urls))

with open('data.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title', 'Resource Url'])
    for row in results:
        w.writerow(row)
My desired output would appear like this:
The approach is pretty straightforward. The given website contains the required elements in a list tag. What you need to do is get the source code of each <li> tag and fetch the heading, which has a certain class; the same goes for the like count.
The catch with the like count is that the text contains some noise. To fix that, you can use a regular expression to extract the digits ('\d+') from the likes text. The following code gives the desired result:
from bs4 import BeautifulSoup as soup
import requests
import re
import pandas as pd

source = requests.get('https://data.nsw.gov.au/data/dataset')
sp = soup(source.text, 'lxml')

element = sp.find_all('li', {'class': "dataset-item"})

heading = []
likeList = []
for i in element:
    try:
        header = i.find('a', {'class': "searchpartnership-url-analytics"})
        heading.append(header.text)
    except:
        header = i.find('a')
        heading.append(header.text)
    like = i.find('span', {'id': 'likes-count'})
    likeList.append(re.findall(r'\d+', like.text)[0])

data = {'Title': heading, 'Likes': likeList}
df = pd.DataFrame(data)
print(df)
Hope it helped!
You could use the following.
I am using a CSS selector with Or syntax to retrieve titles and likes as one list (as every publication has both). I then use slicing to separate the titles from the likes.
from bs4 import BeautifulSoup as bs
import requests
import csv

def get_titles_and_likes(s, url, css_selector):
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    info = [item.text.strip() for item in soup.select(css_selector)]
    titles = info[::2]
    likes = info[1::2]
    return list(zip(titles, likes))

results = []

with requests.Session() as s:
    for page in range(1, 10):  # set number of pages
        data = get_titles_and_likes(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page),
                                    '.dataset-heading .searchpartnership-url-analytics, .dataset-heading [href*="/data/dataset"], .dataset-item #likes-count')
        results.append(data)

results = [i for item in results for i in item]

with open(r'data.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title', 'Likes'])
    for row in results:
        w.writerow(row)

How can I make the output pair each list item with its content in my Python code?

I have been developing a Python web crawler for this website. I made two functions, which work well separately.
One is to collect the list of stocks and
another is to collect the content data of each listing.
I would like the output of my code to be pairs of
"list#1/content#1",
"list#2/content#2",
"list#3/content#3",
What needs to be modified in my code in order to achieve this?
Thanks.
from bs4 import BeautifulSoup
import urllib.request

CAR_PAGE_TEMPLATE = "http://www.bobaedream.co.kr/cyber/CyberCar.php?gubun=I&page="
BASE_PAGE = 'http://www.bobaedream.co.kr'

def fetch_post_list():
    for i in range(20, 21):
        URL = CAR_PAGE_TEMPLATE + str(i)
        res = urllib.request.urlopen(URL)
        html = res.read()
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', class_='cyber')
        # print("Page#", i)

        # 50 lists per each page
        lists = table.find_all('tr', itemtype="http://schema.org/Article")
        count = 0
        for lst in lists:
            if lst.find_all('td')[3].find('em').text:
                lst_price = lst.find_all('td')[3].find('em').text
                lst_title = lst.find_all('td')[1].find('a').text
                lst_link = lst.find_all('td')[1].find('a')['href']
                lst_photo_url = ''
                if lst.find_all('td')[0].find('img'):
                    lst_photo_url = lst.find_all('td')[0].find('img')['src']
                count += 1
            else:
                continue
            # print('#', count, lst_title, lst_photo_url, lst_link, lst_price)
    return lst_link

def fetch_post_content(lst_link):
    URL = BASE_PAGE + lst_link
    res = urllib.request.urlopen(URL)
    html = res.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Basic Information
    table = soup.find('div', class_='rightarea')

    # Number, Year, Mileage, Gas Type, Color, Accident
    content_table1 = table.find_all('div')[0]
    dds = content_table1.find_all('dd')
    for dd in dds:
        car_span_t = dd.find_all('span', {'class': 't'})[0]
        car_span_s = dd.find_all('span', {'class': 's'})[0]
        # print(car_span_t.text, ':', car_span_s.text)

    # Seller Information
    content_table2 = table.find_all('div')[1]
    dds2 = content_table2.find_all('dd')
    for dd2 in dds2:
        seller_span_t = dd.find_all('span', {'class': 't'})[0]
        seller_span_s = dd.find_all('span', {'class': 's'})[0]
        # print(seller_span_t.text, ':', seller_span_s.text)

    return dds
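One way to get the paired output (a minimal sketch, not the original poster's code: it reuses the constants and imports above and assumes the list function is changed to return every link instead of only the last lst_link):
def fetch_all_links():
    # same scraping logic as fetch_post_list, but collect every link
    links = []
    for i in range(20, 21):
        URL = CAR_PAGE_TEMPLATE + str(i)
        soup = BeautifulSoup(urllib.request.urlopen(URL).read(), 'html.parser')
        table = soup.find('table', class_='cyber')
        for lst in table.find_all('tr', itemtype="http://schema.org/Article"):
            if lst.find_all('td')[3].find('em').text:
                links.append(lst.find_all('td')[1].find('a')['href'])
    return links

# pair each list entry with the content scraped from its detail page
pairs = [(link, fetch_post_content(link)) for link in fetch_all_links()]
for link, content in pairs:
    print(link, '/', content)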
