Selenium

Charlie Zhang

9/20/2021

Introduction

Classical Requests and BeautifulSoup

Suppose we need to scrape Paraguay’s Comptroller General (Contraloría General de la República)
- The url is https://portaldjbr.contraloria.gov.py/portal-djbr/
- Here is what the website looks like:

Python Code

# Import packages
import requests
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore")

try:
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        }
    url = "https://portaldjbr.contraloria.gov.py/portal-djbr/"
    r = requests.get(url, headers=headers)
    r.raise_for_status()  # raise an HTTPError for 4xx/5xx responses
    print(r.status_code)
except requests.HTTPError as e:
    print(e)
    print("HTTPError")
except requests.RequestException as e:
    print(e)
except Exception:
    print("Unknown Error!")
## HTTPSConnectionPool(host='portaldjbr.contraloria.gov.py', port=443): Max retries exceeded with url: /portal-djbr/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)')))
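The request never gets a response: certificate verification fails before any HTML comes back. For this particular error, requests does offer an escape hatch: passing verify=False skips certificate verification (insecure, so only for sites you have reason to trust). A minimal sketch, reusing the url and headers from above:

# Possible workaround for the SSL error: skip certificate verification (insecure)
r = requests.get(url, headers=headers, verify=False)
print(r.status_code)

Even then, pages that render their content with JavaScript cannot be scraped with requests alone, and that is where Selenium comes in.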

A Simple Example

Suppose we want to know what the first-page results are when we search Google for georgetown dspp:

from selenium import webdriver

# Initiate the driver
driver = webdriver.Chrome("/Applications/chromedriver")
url = "https://www.google.com"
driver.get(url)

# Type the search term into the query box and submit
search = driver.find_element_by_name("q")
search.send_keys('georgetown dspp')
search.submit()

# Find the url of each result
results = driver.find_elements_by_xpath('//div[@class="yuRUbf"]//a[@href]')
for result in results:
    print(result.get_attribute("href"))
## https://mccourt.georgetown.edu/master-of-science-in-data-science-for-public-policy/
## https://mccourt.georgetown.edu/msdspp-request-info/
## https://andykgreen.com/
## https://www.reddit.com/r/georgetown/comments/hbatt5/how_good_is_the_msdspp_degrees_data_science/
## https://www.instagram.com/georgetownmccourtschool/?hl=en
## https://math.sciences.ncsu.edu/2018/09/27/why-your-students-should-study-data-science-and-public-policy-at-georgetown-2/
## https://www.linkedin.com/in/andykgreen
## https://en.m.wikipedia.org/wiki/McCourt_School_of_Public_Policy
## https://mccourt.georgetown.edu/people/staff/

# Quit the driver
driver.quit()
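Note that the find_element_by_* helpers used above are the Selenium 3 API, which was removed in Selenium 4.3+. A minimal sketch of the same search with the newer By-based locators (same assumed chromedriver path):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 style: pass the driver path through a Service object
driver = webdriver.Chrome(service=Service("/Applications/chromedriver"))
driver.get("https://www.google.com")

# By-based locators replace the find_element_by_* helpers
search = driver.find_element(By.NAME, "q")
search.send_keys('georgetown dspp')
search.submit()

for result in driver.find_elements(By.XPATH, '//div[@class="yuRUbf"]//a[@href]'):
    print(result.get_attribute("href"))

driver.quit()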

More Practical Scenarios

import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import urllib.request
import time


def getFBpost(id=None, pwd=None, page=None):

    # Initiate the driver
    service = Service("/Applications/chromedriver")
    service.start()
    driver = webdriver.Remote(service.service_url)
    driver.get('http://www.facebook.com')

    # Find the email and password fields and log in
    email = driver.find_element_by_id("email")
    password = driver.find_element_by_id("pass")
    email.send_keys(id)
    password.send_keys(pwd)
    password.submit()

    # Give the login a few seconds to complete
    time.sleep(5)

    # Build the page url and open it
    url = "https://www.facebook.com/" + str(page)
    driver.get(url)

    # Use Selenium to execute JavaScript: scroll down to load more posts
    for _ in range(1, 5):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(3)

    # Use BeautifulSoup to parse the page content
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Get the posts (the class string is tied to Facebook's current markup)
    titles = soup.find_all("div", {"class": "kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x c1et5uql ii04i59q"})

    with open("posts.txt", 'w') as file:
        for title in titles:
            post = title.find('div', {'dir': 'auto'})
            if post is not None:
                file.write(post.getText() + "\n")

    driver.quit()
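As an aside, the fixed time.sleep(5) after login is fragile; Selenium's explicit waits block only until a condition holds. A minimal sketch, assuming the same element id as above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the email field to appear, instead of sleeping blindly
email = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "email"))
)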

Cont’d

with open("id.txt", "r") as f1, open("pwd.txt", "r") as f2:
  id = str(f1.readlines()[0])
  pwd = str(f2.readlines()[0])

if __name__ == "__main__":
    getFBpost(id = id,
              pwd = pwd,
              page = "georgetownuniv")
              
with open("posts.txt", "r") as file:
  print(file.readlines()[0])
              
## President Biden has formally announced the nomination of McCourt Professor Dr. Adriana Kugler for U.S. Executive Director of the International Bank for Reconstruction and Development, part of the World Bank Group. The U.S. Executive Director represents the United States and its interests on the Board of Directors of the World Bank Group, a leading financier of basic health, education, infrastructure, environmental, governance and anti-corruption programs in the developing wor… See More

Improve Efficiency through Multithreading


Selenium can be efficient! Multithreading lets several browser instances work in parallel; just be sure to keep time.sleep() or explicit waits in your code so each page finishes loading before it is parsed.

Original Version

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from selenium import webdriver

def getTitle(url=None):

    # Collect the link of every question on the tag page
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    titles = [urljoin(url, items.get("href")) for items in soup.select(".summary .question-hyperlink")]

    # Open the first question in the browser and grab its title
    driver = webdriver.Chrome("/Applications/chromedriver")
    driver.get(titles[0])

    content = BeautifulSoup(driver.page_source, "html.parser")
    item = content.select_one("h1 a").text
    driver.quit()

    print(item)

if __name__ == "__main__":
    url = "https://stackoverflow.com/questions/tagged/web-scraping"
    getTitle(url=url)
## Python Scraping: Is there anyway that I can scrape this line of code using beautifulsoup? (Finding content of a tag that contain specific tag/string)

I timed this with %timeit, and the result is 1.14 s ± 171 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) for a single question page.
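For reference, %timeit is an IPython magic; a call matching the reported 7 runs of 1 loop each would look like:

# IPython: time getTitle over 7 runs of 1 loop each
%timeit -n 1 -r 7 getTitle(url=url)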

Use Multithreading

import requests
from urllib.parse import urljoin
from multiprocessing.pool import ThreadPool
from bs4 import BeautifulSoup
from selenium import webdriver
import threading

def getLinks(link):
    # Collect the link of every question on the tag page
    r = requests.get(link)
    soup = BeautifulSoup(r.text, "html.parser")
    titles = [urljoin(link, items.get("href")) for items in soup.select(".summary .question-hyperlink")]
    return titles

threadLocal = threading.local()

def setUp():
    # Reuse one headless browser per thread instead of opening one per page
    driver = getattr(threadLocal, 'driver', None)

    if driver is None:
        options = webdriver.ChromeOptions()
        options.headless = True
        driver = webdriver.Chrome("/Applications/chromedriver", options=options)
        setattr(threadLocal, 'driver', driver)

    return driver


def getTitles(url):

    driver = setUp()
    driver.get(url)

    content = BeautifulSoup(driver.page_source, "html.parser")
    item = content.select_one("h1 a").text

    print(item)

if __name__ == '__main__':
    url = "https://stackoverflow.com/questions/tagged/web-scraping"
    ThreadPool(5).map(getTitles, getLinks(url))
## Error while webscraping data from glassdoor using python
## How can I download specific message from Voov meeting to excel file [closed]
## How to scrape text from a hidden element?
## Is there a possibility to show the IP address used in each request?
## Python Scraping: Is there anyway that I can scrape this line of code using beautifulsoup? (Finding content of a tag that contain specific tag/string)
## While using Python Scrapy library, Response.css is showing empty list while fetching div data?
## web scrapping a table without a class or Id
## Threadpool using more threads than asked to?
## C# HtmlAgilityPack olx scraping [closed]
## How to download static files when scraping a website using Java JSOUP
## BeautifulSoup find_all that 'Kind of match'
## Multithreading - web-scrapping - Python
## TimeoutError Web Scrapping Python
## Not able to scrape entire table using pd.read_html
## Can't fix an error caused by cheerio while using axios
## Auto login to Amazon Using PHP to Scrape Data
## How to Scrape data from chart with R
## Access link href using BeautifulSoup in Python
## How can I scrape html of any website, find specific values and manipulate, then send back data to client side?
## R: webscrapping returns no applicable method for 'xml_find_all' applied to an object of class "character"?
## How can I take down the scraped Stack Overflow post from a third-party website? [duplicate]
## Can't apply the logic of grabbing next pages within a function
## Fix error in a For Loop used to extract reviews of a product given different urls
## Beautifulsoup4 find_all not getting the results I need
## I am starting web-scraping with python and
## Unable to sendkey on a password field
## Scrape facebook page like using Selenium
## I need to connect HMA VPN using python
## browser tab closing in selenium python
## Obtain data of Freight Index in python
## Extract content from html dat saved in a text file using Beautifulsoup
## getting curve values from an interactive figure using scrapy python
## urllib.error.HTTPError: HTTP Error 404: Not Found for yahoo finance scrape
## Scraping Data Using Requests and Beautifulsoup
## How to store Knowledge Graph?
## interagtion of scrapper.py to flask
## Javascript-Resolve promise to HTML
## How to get data past the "Show More" button that DOESN'T change the URL?
## Unable to parse an image link from a webpage using requests
## how to scrape a description from a website
## Product Data Scraping API Information [closed]
## How to Scrape multiple images from a website with Python [closed]
## How to scrape web table that has rows within rows?
## IMPORTXML Formula in Google Sheets
## Scraping frames in R without RSelenium?
## How to scrape a directory full of .html files using scrapy?
## Is there a way to scrape information from tippy on a webpage, utilizing only bs4 and requests module? [closed]
## Python selenium, why my text can't be located by WebDriverWait.until? Text could be located by page_source.find but not find_elements_by_xpath
## Script gets stuck somewhere in it's execution process
## BeautifulSoup: extract nth child in a list
## [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]

Where did I change the code?

The page links are still collected with requests, but each question title is now fetched by a pool of five threads, each reusing its own headless Chrome instance. The run time is 18.723 s for all 50 questions; scraping them sequentially at 1.14 s each would take roughly 57 s, so this is at least 3.04 times faster than before.
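One caveat: the thread-local drivers are never quit, so the five headless browsers stay open after the pool finishes. A minimal cleanup sketch; the global allDrivers registry is my addition, not part of the code above:

allDrivers = []  # track every driver created so we can quit them at the end

def setUp():
    driver = getattr(threadLocal, 'driver', None)
    if driver is None:
        options = webdriver.ChromeOptions()
        options.headless = True
        driver = webdriver.Chrome("/Applications/chromedriver", options=options)
        setattr(threadLocal, 'driver', driver)
        allDrivers.append(driver)  # list.append is thread-safe in CPython
    return driver

# after ThreadPool(5).map(...) finishes:
for d in allDrivers:
    d.quit()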

Selenium for Testing

I haven’t used Selenium for testing myself, but the code is quite similar to what we have already seen.

import unittest
from selenium import webdriver

class SearchText(unittest.TestCase):
    def setUp(self):
        # create a new Chrome session
        self.driver = webdriver.Chrome("/Applications/chromedriver")
        self.driver.implicitly_wait(30)
        self.driver.maximize_window()
        # navigate to the application home page
        self.driver.get("http://www.google.com/")

    def test_search_by_text(self):
        # get the search textbox
        self.search_field = self.driver.find_element_by_name("q")

        # enter search keyword and submit
        self.search_field.send_keys("Selenium WebDriver Interview questions")
        self.search_field.submit()

        #get the list of elements which are displayed after the search
        lists = self.driver.find_elements_by_xpath('//div[@class="yuRUbf"]//a[@href]')
        self.assertEqual(10, len(lists))

    def tearDown(self):
        # close the browser window
        self.driver.quit()


if __name__ == '__main__':
    unittest.main(exit=False)
## <unittest.main.TestProgram object at 0x7f92cadecd60>
## 
## F
## ======================================================================
## FAIL: test_search_by_text (__main__.SearchText)
## ----------------------------------------------------------------------
## Traceback (most recent call last):
##   File "<string>", line 20, in test_search_by_text
## AssertionError: 10 != 14
## 
## ----------------------------------------------------------------------
## Ran 1 test in 4.906s
## 
## FAILED (failures=1)
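The failure is expected: Google happened to return 14 result links rather than exactly 10, so the hard-coded assertEqual is brittle. If the intent is simply "the search returned results", replacing that assertion with a lower bound makes the test more robust:

# inside test_search_by_text: assert at least some results, not an exact count
self.assertGreaterEqual(len(lists), 10)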

Conclusion