
Web Scraping with Python

Overview

Pipeline (sketched in code below):

Setup: define the objective and identify the pages (URLs) to scrape

Acquisition: download the HTML source and parse it into a queryable structure

Processing: extract the data of interest and run it through the analysis
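
A minimal sketch of the three stages in one script; the URL and XPath here are only illustrative:

import requests
from scrapy import Selector

# Setup: pick the page to scrape (illustrative URL)
url = 'https://example.com'

# Acquisition: download the page's HTML source
html = requests.get(url).text

# Processing: parse the HTML and extract the data we want
sel = Selector(text=html)
headings = sel.xpath('//h1/text()').extract()
print(headings)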

HTML

<html>
    <body>
        <div>
            <p>Hello World!</p>
            <p>Enjoy DataCamp!</p>
        </div>
        <p>Thanks for Watching!</p>
    </body>
</html>

HTML Tags and Attributes

<tag-name attrib-name="attrib info">
    element_contents
</tag-name>

Example:

<div id="unique-id" class="some_class">
    div element contents
</div>
<a href="https://www.datacamp.com">
    This text links to DataCamp
</a>

XPath

xpath = '/html/body/div[2]'             # absolute path: the 2nd div child of the body
xpath = '//table'                       # all table elements anywhere in the document
xpath = '/html/body/div[2]//table'      # all tables anywhere inside that 2nd div
xpath = '//div[@id="uid"]'              # div elements whose id attribute equals "uid"
xpath = '//span[@class="span-class"]'   # span elements whose class attribute equals "span-class"

XPaths and Selectors

Scrapy Selector

from scrapy import Selector

sel = Selector(text=html)

sel.xpath("//p")
# outputs the SelectorList:
# [<Selector xpath='//p' data='<p>Hello World!</p>'>,
#  <Selector xpath='//p' data='<p>Enjoy DataCamp!</p>'>]

sel.xpath("//p").extract()
# out: ['<p>Hello World!</p>',
#       '<p>Enjoy DataCamp!</p>']

sel.xpath("//p").extract_first()
# out: '<p>Hello World!</p>'

# Chaining: equivalent to sel.xpath('/html/body/div[2]')
sel.xpath('/html').xpath('./body').xpath('./div[2]')

Example:

# Import a scrapy Selector
from scrapy import Selector

# Import requests
import requests

# URL of the page to scrape
url = 'https://www.datacamp.com/courses/all'

# Create the string html containing the HTML source
html = requests.get(url).text

# Create the Selector object sel from html
sel = Selector(text=html)

# Print out the number of elements in the HTML document
print("There are 1020 elements in the HTML document.")
print("You have found: ", len(sel.xpath('//*')))

CSS Locators, Chaining and Responses

CSS (Cascading Style Sheets) is the language used to style HTML documents; its selector syntax gives us a second way to locate elements.

Comparing with XPath selectors:

/ in XPath becomes > in CSS (except at the start of the locator)

// in XPath becomes a blank space in CSS (except at the start of the locator)

[N] in XPath becomes :nth-of-type(N) in CSS

Example: the following two locators select the same elements:

xpath = '/html/body//div/p[2]'
css_locator = 'html > body div > p:nth-of-type(2)'

Attributes in CSS

In CSS, # selects by id and . selects by class:

css_locator = 'div#uid'             # div element with id "uid"
css_locator = 'span.span-class'     # span elements with class "span-class"

CSS Attributes and Text Selection

Getting the value of an attribute:

XPath –> @attr-name

xpath = '//div[@id="uid"]/a/@href'

CSS Locator –> ::attr(attr-name)

css_locator = 'div#uid > a::attr(href)'

Text Extraction:

XPath –> text()

sel.xpath('//p[@id="p-example"]/text()').extract()

CSS Locator –> ::text

# Text within immediate child
sel.css('p#p-example::text').extract()

# Text within all future generations (blank space before '::text')
sel.css('p#p-example ::text').extract()
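
A small self-contained sketch tying attribute and text selection together; the HTML snippet is made up for illustration:

from scrapy import Selector

html = '''
<div id="uid" class="some_class">
    <a href="https://www.datacamp.com">This text links to DataCamp</a>
    <p id="p-example">Hello <b>World!</b></p>
</div>
'''
sel = Selector(text=html)

# Attribute value: both lines return ['https://www.datacamp.com']
sel.xpath('//div[@id="uid"]/a/@href').extract()
sel.css('div#uid > a::attr(href)').extract()

# Text within the immediate child only: ['Hello ']
sel.css('p#p-example::text').extract()

# Text within all future generations: ['Hello ', 'World!']
sel.css('p#p-example ::text').extract()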

Response Objects

Response objects have all the tools we learned for Selectors: the xpath and css methods let us query the HTML document, and the extract and extract_first methods let us isolate the selected data.

A response object also keeps track of the URL that the HTML code was loaded from (response.url), and helps us move from one site to another (response.follow(next_url)), in order to crawl and scrape the web.
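
A rough sketch of how these pieces are used inside a spider's parse method; the selector string and callback wiring here are illustrative:

# (this method lives inside a scrapy.Spider subclass -- see the Spiders section below)
def parse(self, response):
    # the URL this HTML was loaded from
    print(response.url)
    # query the document exactly as with a Selector
    links = response.css('a::attr(href)').extract()
    # follow each link and hand the next response to this same method
    for link in links:
        yield response.follow(url=link, callback=self.parse)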

Spiders

Creating a Spider:

import scrapy
from scrapy.crawler import CrawlerProcess

class SpiderClassName(scrapy.Spider):
    name = "spider_name"

    # code for spider
    ...

process = CrawlerProcess()
process.crawl(SpiderClassName)
process.start()

Basic Structure of a Spider:

class DCSpider(scrapy.Spider):

    name = 'dc_spider'

    def start_requests(self):
        urls = ['https://datacamp.com/courses/all']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # simple example: write out html
        html_file = 'DC_courses.html'
        with open(html_file, 'wb') as fout:
            fout.write(response.body)

Parsing and Crawling

Saving links to file:

class DCSpider(scrapy.Spider):
   name = "dcspider"

   def start_requests(self):
      urls = ['https://www.datacamp.com/courses/all']
      for url in urls:
         yield scrapy.Request(url=url, callback=self.parse)

   def parse(self, response):
      links = response.css('div.course-block > a::attr(href)').extract()
      filepath = 'DC_links.csv'
      with open(filepath, 'w') as f:
         f.writelines([link + '\n' for link in links])

Following links to parse individual pages

class DCSpider(scrapy.Spider):
   name = "dcspider"

   def start_requests(self):
      urls = ['https://www.datacamp.com/courses/all']
      for url in urls:
         yield scrapy.Request(url=url, callback=self.parse)

   def parse(self, response):
      links = response.css('div.course-block > a::attr(href)').extract()
      for link in links:
         yield response.follow(url=link, callback=self.parse2)
   
   def parse2(self, response):
      # parse individual course sites
      pass

This branching structure, following links within pages and then further links within those subpages, is where the name spider comes from: we branch out into a web of pages to be scraped.

Building a full spider

import scrapy
from scrapy.crawler import CrawlerProcess

class DC_Chapter_Spider(scrapy.Spider):
   
   name = "dc_chapter_spider"

   def start_requests(self):
      url = 'https://www.datacamp.com/courses/all'
      yield scrapy.Request(url=url,
                           callback=self.parse_front)

   def parse_front(self, response):
      # Narrow in on the course blocks
      course_blocks = response.css('div.course-block')
      # Direct to course links
      course_links = course_blocks.xpath('./a/@href')
      # Extract links
      links_to_follow = course_links.extract()
      # Follow links to the next parser
      for url in links_to_follow:
         yield response.follow(url=url,
                               callback=self.parse_pages)
   
   def parse_pages(self, response):
      # Direct to the course title
      crs_title = response.xpath('//h1[contains(@class, "title")]/text()')
      # Extract and clean course title text
      crs_title_ext = crs_title.extract_first().strip()
      # Direct to chapter titles
      ch_titles = response.css('h4.chapter__titles::text')
      # Extract and clean the chapter titles text
      ch_titles_ext = [t.strip() for t in ch_titles.extract()]
      # Store data in dictionary
      dc_dict[crs_title_ext] = ch_titles_ext

dc_dict = dict()

process = CrawlerProcess()
process.crawl(DC_Chapter_Spider)
process.start()
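
Once the crawl finishes, dc_dict maps each course title to its list of chapter titles. A quick way to inspect a few entries (a sketch, assuming the crawl populated the dictionary):

# Preview the first few scraped courses and their chapters
for course, chapters in list(dc_dict.items())[:3]:
    print(course)
    for chapter in chapters:
        print('  -', chapter)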