
Lesson 20: CSS Selectors with lxml

Author: Jarvan    Category: Python    Published: 2019-06-20 13:20

lxml offers two main ways of locating elements in HTML (a quick side-by-side sketch follows the list):
1. cssselect
2. xpath
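
The rest of this lesson focuses on cssselect. As a minimal sketch (the HTML snippet below is made up for illustration), both approaches can pull out the same elements:

# -*- coding: utf-8 -*-
from lxml import html

snippet = '<div><a class="tit" href="/a.html">First</a><a class="tit" href="/b.html">Second</a></div>'
doc = html.fromstring(snippet)

# CSS selector: every <a> with class "tit"
print([a.text for a in doc.cssselect('a.tit')])           # ['First', 'Second']

# the equivalent XPath expression
print([a.text for a in doc.xpath('//a[@class="tit"]')])   # ['First', 'Second']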

  1. Parsing HTML with the html module
    • from lxml import html
    • Selecting elements with CSS selectors
      • doc = html.fromstring(source_string)
        html.tostring(ele) # convert an Element object back to a string
      • fromstring() returns an Element object, which exposes the following attributes and methods (a short usage sketch follows this list):
      • addnext, addprevious, append, attrib, base, base_url, body, classes, clear, cssselect, drop_tag, drop_tree, extend, find, find_class, find_rel_links, findall, findtext, forms, get, get_element_by_id, getchildren, getiterator, getnext, getparent, getprevious, getroottree, head, index, insert, items, iter, iterancestors, iterchildren, iterdescendants, iterfind, iterlinks, itersiblings, itertext, keys, label, make_links_absolute, makeelement, nsmap, prefix, remove, replace, resolve_base_href, rewrite_links, set, sourceline, tag, tail, text, text_content, values, xpath
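
Here is a minimal sketch of a few of these members in action (the HTML string is invented for illustration):

# -*- coding: utf-8 -*-
from lxml import html

doc = html.fromstring('<div id="box"><a href="/1.html">hello</a> world</div>')
a = doc.cssselect('a')[0]

print(a.tag)              # 'a'
print(a.attrib)           # {'href': '/1.html'} -- attributes behave like a dict
print(a.get('href'))      # '/1.html'
print(a.text)             # 'hello' -- the element's own text
print(a.tail)             # ' world' -- text that follows the closing tag
print(doc.text_content()) # 'hello world' -- all text under the <div>
print(html.tostring(a, encoding='unicode'))  # '<a href="/1.html">hello</a> world'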

Example 1: using a CSS selector to grab Baidu's trending search keywords

html.fromstring(source) # converts a string into an Element, after which Element methods and attributes such as .cssselect() become available

html.tostring(element) # converts an Element back into a string so you can inspect its source

# -*- coding: utf-8 -*-
from lxml import html
import requests

def download():
    url = 'https://www.baidu.com/s?wd=seo'
    headers = {
        "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/68.0.3440.106 Safari/537.36'
    }
    try:
        resp = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        source = None
    else:
        source = resp.text
    return source

if __name__ == '__main__':
    source = download()
    doc = html.fromstring(source)   # convert the string into an Element
    # print(doc, dir(doc))
    top = doc.cssselect('table.opr-toplist1-table tr')
    for ele in top:
        # print(html.tostring(ele, encoding='utf-8').decode('utf-8')) # convert the Element back to a string to inspect its source

        # texts = []
        # for e in ele.itertext():
        #     hastext = e.strip()
        #     if hastext:
        #         texts.append(hastext)
        # print('-'.join(texts))

        print('-'.join(e.strip() for e in ele.itertext() if e.strip()))  # this generator expression is equivalent to the commented-out loop above

The cssselect() method returns a list.

The .text attribute returns the element's own text. If the element has no direct text, or its text is wrapped inside child tags, it returns None.

itertext() is a method that returns an iterator, so you have to loop over it to collect everything. It yields all of the text under the element, including text nested inside child tags.
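
A minimal sketch of the difference (the HTML fragment is made up for illustration):

# -*- coding: utf-8 -*-
from lxml import html

doc = html.fromstring('<div><b>Python</b> tutorial <i>lesson 20</i></div>')

print(doc.text)                      # None -- the <div> has no text of its own before its first child
print(doc.cssselect('b')[0].text)    # 'Python'
print(list(doc.itertext()))          # ['Python', ' tutorial ', 'lesson 20']
print('-'.join(e.strip() for e in doc.itertext() if e.strip()))  # 'Python-tutorial-lesson 20'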

remove(element) removes the given child element from the tree (see the sketch below).
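
A minimal sketch (the markup and the #ad id are invented) of pruning an unwanted child before collecting text; the article spider at the end of this lesson uses the same trick:

# -*- coding: utf-8 -*-
from lxml import html

doc = html.fromstring('<div class="a_detail">keep this <span id="ad">drop this</span></div>')
ad = doc.cssselect('#ad')[0]
doc.remove(ad)                          # remove() must be called on the direct parent of the element
print(''.join(doc.itertext()).strip())  # 'keep this'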

make_links_absolute(base_url) # expands relative links into absolute links

# -*- coding: utf-8 -*-
from lxml import html
import requests


def download(url, retries=3):
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible;Baiduspider-render/2.0; "
                      "+http://www.baidu.com/search/spider.html)"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        source = None
        if retries > 0:
            return download(url, retries - 1)
    else:
        resp.encoding = 'utf-8'
        source = resp.text
    return source


if __name__ == "__main__":
    url = "http://www.wenzhangba.com/qingganwenzhang/list_2_2.html"
    source = download(url)
    doc = html.fromstring(source)
    pages = doc.cssselect(".page a")
    for page in pages:
        # make_links_absolute(base_url) expands relative links into absolute links
        page.make_links_absolute(url)
        # .attrib is a dict of attribute names and values, so .get() can be used to read the one you want
        print(page.attrib.get('href'))

Below is a complete example: several threads crawl the list pages and extract article URLs, a set deduplicates them, and another pool of threads downloads the articles.

# -*- coding: utf-8 -*-
from lxml import html
import requests
from threading import Thread
from queue import Queue
import time


class Downloader:
    def download(self, url, retries=3):
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible;Baiduspider-render/2.0; "
                          "+http://www.baidu.com/search/spider.html)"
        }
        try:
            resp = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            source = None
            if retries > 0:
                return self.download(url, retries - 1)
        else:
            resp.encoding = 'utf-8'
            source = resp.text
        return source


# This class only crawls list pages, so it is fed list-page URLs.
# After a list page is crawled, the article URLs are extracted from it,
# so a separate queue is needed to hold those article URLs.
class SWListSpider(Thread, Downloader):
    seens = set()

    def __init__(self, host, list_queue, art_queue):
        Thread.__init__(self)
        self.list_queue = list_queue
        self.art_queue = art_queue
        self.base_url = host

    def run(self):
        while True:
            url = self.list_queue.get()
            if url in self.seens:
                self.list_queue.task_done()
                continue
            self.seens.add(url)
            print("Download: {}".format(url))
            source = self.download(url)
            # Note: seens above is a class attribute; a class attribute is only visible to all instances when it is modified through the class (mutating the shared set works for every instance)
            if source is None:
                self.list_queue.task_done()
                continue
            self.extract_list(source)
            self.list_queue.task_done()

    def extract_list(self, source):
        doc = html.fromstring(source)
        art_list = doc.cssselect('#conts .cmt_info .tit a')
        list_urls = doc.cssselect('.page a')
        for item in list_urls:
            item.make_links_absolute(self.base_url)  # complete relative links
            list_url = item.attrib.get('href', '')
            if list_url.endswith('.html') and list_url not in self.seens:
                # list-page URLs must end with .html and must not have been crawled before
                self.list_queue.put(list_url)

        for ele in art_list:
            art_url = ele.attrib.get('href', '')
            if art_url.endswith('.html'):
                # what we extracted here is an article URL,
                # so put it on the article-URL queue
                self.art_queue.put(art_url)


# This class only crawls articles, so it is fed article URLs.
class SWArticleSpider(Thread, Downloader):
    def __init__(self, art_queue):
        super().__init__()
        self.url_queue = art_queue

    def run(self):
        while True:
            url = self.url_queue.get()
            print("Download: {}".format(url))
            source = self.download(url)
            if source is None:
                self.url_queue.task_done()
                continue
            self.extract_article(source)
            self.url_queue.task_done()

    @staticmethod
    def extract_article(source):
        doc = html.fromstring(source)
        title = doc.cssselect('h1 a')[0].text
        content = doc.cssselect('div.a_detail')[0]
        content.remove(content.cssselect('#content_article')[0])  # remove this child element so its text is excluded below
        content = ''.join(content.itertext()).strip()
        with open('articles/{}.txt'.format(title), mode='w',
                  encoding='utf-8') as f:
            f.write(content)


if __name__ == "__main__":
    list_queue = Queue()
    art_queue = Queue()
    start_url = 'http://www.wenzhangba.com/qingganwenzhang/'
    list_queue.put(start_url)

    for i in range(5):
        ls = SWListSpider(start_url, list_queue, art_queue)
        ls.setDaemon(True)
        ls.start()
        time.sleep(0.1)

    for i in range(10):
        sw = SWArticleSpider(art_queue)
        sw.setDaemon(True)
        sw.start()
        time.sleep(0.1)  # stagger each thread's start by 100 ms

    list_queue.join()
    art_queue.join()
    print("Done.")

# A dict's .get() looks up a key; the second argument is the default value
# returned when the key is missing.
list_url = item.attrib.get('href', '')

# .endswith('str') checks whether a string ends with 'str'
if list_url.endswith('.html'):
