More Related Content
Similar to Lispmeetup #56 Common lispによるwebスクレイピング技法 (20)
More from Satoshi imai (8)
Lispmeetup #56 Common lispによるwebスクレイピング技法
- 5. 4 日経新聞から現在の日経平均株価を取得
(ql:quickload :dexador)
(ql:quickload :plump)
(ql:quickload :clss)
(ql:quickload :cl-ppcre)
;; Fetch the Nikkei markets page with Dexador; the result is fed to the HTML parser below.
(defparameter article-html (dex:get "http://www.nikkei.com/markets/kabu/"))
;; Parse the fetched HTML into a Plump DOM tree.
(defparameter parse-tree (plump:parse article-html))
;; Select <span> elements with class "mkc-stock_prices" and keep the first match.
;; NOTE(review): index 0 is taken unconditionally -- confirm the page always contains such a span.
(defparameter sub-tree (aref (clss:select "span.mkc-stock_prices" parse-tree) 0))
;; Print the text of the first child node of the selected span.
(print (plump:text (aref (plump:children sub-tree) 0)))
- 6. 4.1 同じことをPYTHONでやろうとすると
import urllib.request
from bs4 import BeautifulSoup
# Fetch the Nikkei markets page and print the text of the span that carries
# the "mkc-stock_prices" class (empty string when no such span exists).
url = "http://www.nikkei.com/markets/kabu/"

# Use a context manager so the HTTP response is closed once the body is read.
with urllib.request.urlopen(url) as response:
    data = response.read()

soup = BeautifulSoup(data, "html.parser")

# Match on class membership rather than a substring test against the literal:
# the previous `string_ in "mkc-stock_prices"` check also accepted any class
# name that happened to be a substring (e.g. "stock"), and popping from the
# class list mutated the parsed tree.
tag = soup.find("span", class_="mkc-stock_prices")
nikkei_heikin = tag.string if tag is not None else ""
print(nikkei_heikin)
- 7. 5 ロイターの記事から本文を取得
;; Fetch the Reuters article page with Dexador.
(defparameter article-html (dex:get "http://jp.reuters.com/article/idJPL3N0U325520141219"))
;; Capture the full class name that starts with "ArticleBody_body_" from the raw
;; HTML: the group matches lazily up to the closing double quote of the
;; attribute value. The quote inside the regex literal must be escaped as \"
;; or the Lisp reader ends the string early.
;; NOTE(review): presumably the class suffix varies between pages -- confirm.
(defparameter body-class
  (aref (nth-value 1 (ppcre:scan-to-strings "(ArticleBody_body_.*?)\"" article-html)) 0))
;; Parse the HTML into a Plump DOM tree.
(defparameter parse-tree (plump:parse article-html))
;; Select elements by the captured class name and keep the first match.
(defparameter sub-tree (aref (clss:select (format nil ".~A" body-class) parse-tree) 0))
(defun node-text (node)
  "Return the concatenated text of every text node reached by traversing NODE.

NODE is a Plump node. Text is collected in traversal order. When the
traversal visits no text nodes the result is the empty string."
  ;; Writing into a string stream avoids repeated pairwise CONCATENATE calls
  ;; and, unlike REDUCE without :INITIAL-VALUE, is well-defined when there is
  ;; nothing to join.
  (with-output-to-string (out)
    (plump:traverse node
                    (lambda (text-node)
                      (write-string (plump:text text-node) out))
                    :test #'plump:text-node-p)))
;; Print the article body text extracted from the selected subtree.
(print (node-text sub-tree))
- 9. 6.1 連番でない画像を上から順番にダウンロード(2)
.boxesクラスの部分木を取ってきて、さらにIMGタグを探し、
URLでフィルタをかける
;; Fetch the page with Dexador and parse it into a Plump DOM tree.
(defparameter article-html (dex:get "http://logofaves.com/"))
(defparameter parse-tree (plump:parse article-html))
;; All <img> elements inside the first element that has class "boxes".
(defparameter sub-trees (clss:select "img" (aref (clss:select ".boxes" parse-tree) 0)))
;; List of image URLs, in document order, restricted to the site's upload directory.
(defparameter urls
  (remove-if-not
   (lambda (url)
     ;; Skip images that have no src attribute instead of handing NIL to the
     ;; regex scanner. The dots are escaped so they match a literal "." only.
     (and url
          (cl-ppcre:scan "^http://logofaves\\.com/wp-content/uploads/" url)))
   (map 'list (lambda (node)
                (gethash "src" (plump:attributes node)))
        sub-trees)))
;; Download each URL to /tmp/logo-NNN.jpg, numbering from 000 in list order.
(loop for i from 0
      for url in urls
      do (dex:fetch url (format nil "/tmp/logo-~3,'0d.jpg" i)))