one two three
  • one
  • two
  • three


# -*- coding: utf-8 -*-

import re
import urllib
from BeautifulSoup import BeautifulSoup

html = '''

def main():
  """Demonstrate the common BeautifulSoup find/findAll search patterns.

  Parses the module-level ``html`` sample and prints the result of each
  query to stdout. Every ``print`` takes a single parenthesised argument so
  the statements parse the same way under Python 2 and Python 3.

  Note: ``dict(link.attrs)['href']`` raises ``KeyError`` for an ``<a>`` tag
  that has no ``href`` attribute.
  """
  soup = BeautifulSoup(html)

  # The basic approach: fetch elements by tag name with find or findAll.
  links = soup.findAll('a')
  for link in links:
    print(link.name)                 # tag name
    print(link.string)               # text inside the tag
    print(dict(link.attrs)['href'])  # attrs is a list of tuples, so going through a dict is convenient

  # Besides the tag name, attribute conditions can be passed via attrs.
  blogdiv = soup.find('div', attrs={'id':'blog'})
  # find/findAll also work on an element; the search then starts from its children.
  bloglinks = blogdiv.findAll('a')
  for link in bloglinks:
    print(link)  # printing an element directly outputs the HTML of that part

  # Search by the text inside tags.
  firstlinks = soup.findAll(text="one")
  for link in firstlinks:
    # A text search returns text objects; go through parent to reach the tag.
    print(link.parent)

  # The tag condition of findAll accepts a list to match several tag names.
  li_or_a = soup.findAll(['a', 'li'])
  for tag in li_or_a:
    print(tag)

  # A text search can take several values as well.
  one_or_two = soup.findAll(text=['one', 'two'])
  for tag in one_or_two:
    print(tag)

  # With recursive disabled, only the element's own children are searched.
  only_children = soup.find('body').findAll('div', recursive=False)
  for div in only_children:
    print(div)

  # A compiled regular expression can be used as the condition for an attribute.
  dot_com_links = soup.findAll('a', attrs={'href':re.compile(r'.*?\.com')})
  for link in dot_com_links:
    print(dict(link.attrs)['href'])

  # Giving both a tag name and text still does not return tags.
  firstlinks = soup.findAll('a', text="one")
  for link in firstlinks:
    print(link.parent)  # so going through parent is still required

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
  main()



2 thoughts on “BeautifulSoupでスクレイピングのまとめ”

  1. Thanks for bringing up this topic. I was searching for
    up to date information on this subject for a couple of days,
    I discovered only this page. Now I’m
    satisfied because I have finally reached your post.
    I like how you present and argue all the facts in addition to your overall writing style.
    Sometimes, there’s a lack of time to read long pieces, but yours is
    brief and succinct, I spent just a few minutes to read the whole article.
    It’s essential since no one has enough time to read.

  2. I’m so thankful I came across your article! It took me weeks to search for the information you have mentioned above
    and it’s a genuine blessing to find someone as curious about this topic as myself.

    What I am attempting to say is that the significance
    of this problem cannot be overstated. People who raise it deserve appreciation despite
    the fact that I, personally, do not fully share your own views.
    Anyhow, thank you for sharing your experience!

Leave a Reply

Your email address will not be published. Required fields are marked *