pikesaku’s blog

These are personal study notes. I take no responsibility whatsoever for their content.

Web scraping sample code

This was a real struggle... Explicit waiting turned out to be the key.
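The gist: rather than fixed sleeps, wrap each element lookup in an explicit WebDriverWait with an expected_conditions check and only act once the condition holds. A minimal sketch of that pattern, separate from the script below (the URL and XPath here are placeholders, not the ones the script uses):

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By

wd = webdriver.Chrome()
wd.get('https://example.com/')  # placeholder URL
wait = WebDriverWait(wd, 30)
# Block for up to 30 seconds until the button is actually clickable, then click it
em = wait.until(expected_conditions.element_to_be_clickable((By.XPATH, '//button[@id="submit"]')))  # placeholder XPath
em.click()
wd.quit()

The full script: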

# -*- coding: utf-8 -*-

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

# Maximum number of attempts for the click helpers below
RETRY = 5

def select_by_xpath(wd, xpath, text):
    # Locate the <select> element by XPath
    em = wd.find_element(By.XPATH, xpath)
    # Choose the <select> option by its visible text
    em = Select(em)
    # The options attribute exposes the available choices
    #for i in em.options:
    #    print(i.text)
    em.select_by_visible_text(text)


def click_by_xpath(wd, xpath):
    # Must wait until the element is clickable; otherwise the error below appears intermittently.
    # selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
    # http://www.seleniumqref.com/api/python/conditions/Python_presence_of_element_located.html
    # In the end, a wait plus a retry loop took care of it.
    cnt = 1
    wait = WebDriverWait(wd, 30)
    while True:
        if cnt >= RETRY:
            print('Error: click_by_xpath failed: ' + xpath)
            exit()
        try:
            em = wait.until(expected_conditions.visibility_of_element_located((By.XPATH, xpath)))
            em = wait.until(expected_conditions.element_to_be_clickable((By.XPATH, xpath)))
            em.click()
            break
        except Exception:
            print('Warn: click_by_xpath retried: ' + xpath)
            cnt += 1
            time.sleep(0.5)


def click_by_xpath_with_scroll(wd, xpath):
    # If the element fails to load, scroll down and retry
    cnt = 1
    wait = WebDriverWait(wd, 30)
    while True:
        if cnt >= RETRY:
            print('Error: click_by_xpath_with_scroll failed: ' + xpath)
            exit()
        try:
            em = wait.until(expected_conditions.visibility_of_element_located((By.XPATH, xpath)))
            em = wait.until(expected_conditions.element_to_be_clickable((By.XPATH, xpath)))
            em.click()
            break
        except Exception:
            print('Warn: click_by_xpath_with_scroll retried: ' + xpath)
            wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            cnt += 1
            time.sleep(0.5)


def get_last_page(wd):
    # Give the pagination links time to render, then return the number of the last page
    time.sleep(5)
    xpath = '//a[@ng-click="setCurrent(pageNumber)" and @class="ng-binding"]'
    em = wd.find_elements(By.XPATH, xpath)
    return em[-1].text


def scraping():
    options = webdriver.ChromeOptions()
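    # Uncomment the two lines below to run Chrome headless at a fixed window size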
#    options.add_argument('--headless')
#    options.add_argument('--window-size=1280,1280')
    wd = webdriver.Chrome(options=options)
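    # Implicit wait: each element lookup retries for up to 20 seconds before raising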
    wd.implicitly_wait(20)
    wd.get('https://access.redhat.com/errata')

    # Select the product
    # The options cannot be picked until the dropdown button has been clicked
    click_by_xpath(wd, '//div[@class="col-sm-3"]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="Red Hat Enterprise Linux"]')

    # Select the OS
    click_by_xpath(wd, '//div[@class="col-sm-3 chosen-wrapper" and contains(@ng-hide, "portal_product_variant")]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="Red Hat Enterprise Linux Server"]')

    # Select the version
    click_by_xpath(wd, '//div[@class="col-sm-3 chosen-wrapper" and contains(@ng-hide, "portal_product_version")]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="6"]')

    # Select the architecture
    click_by_xpath(wd, '//div[@class="col-sm-3 chosen-wrapper" and contains(@ng-hide, "portal_architecture")]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="x86_64"]')

    # Show the maximum number of rows per page
    select_by_xpath(wd, '//select[@ng-model="pageSize"]', '100')

    # Get the number of the last page
    last_page = get_last_page(wd)

    # Collect the table data from every page
    for i in range(1, int(last_page) + 1):
        xpath = '//a[@ng-click="setCurrent(pageNumber)" and @class="ng-binding" and text()="' + str(i) + '"]'
        click_by_xpath_with_scroll(wd, xpath)
        html = wd.page_source
        bs = BeautifulSoup(html, 'html.parser')
        table = bs.find_all('table', {'id': 'DataTables_Table_0'})[0]
        rows = table.find_all("tr")
        for row in rows:
            for cell in row.find_all('span', {"class": "cell-content"}):
                print(cell.get_text())
        time.sleep(1)
    wd.quit()


if __name__ == '__main__':
    scraping()
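To run this you need the selenium and beautifulsoup4 packages plus a ChromeDriver that matches the installed Chrome. The script then prints the text of every cell in the errata table, one value per line, page by page.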