
Python Article Originality Check Script [Tested and Working]

Author: Jarvan | Category: SEO | Published: 2020-07-02 12:56

How it works:

1) Connect to the database, read a specified number of articles, and split each article into sentences on punctuation marks, then pick out the sentences that meet the criteria (here I take 2 sentences longer than 12 characters).

2) Query each of those sentences through Baidu's search interface and count how many of the top-10 results contain the sentence in full. If the duplication rate stays under 30% (3 matching results per sentence, so 6 for 2 sentences), i.e. no more than 6 matching results in total, the script reports that the content has low duplication and meets the requirement.
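
With 2 sampled sentences and 10 results per query, 30% of the 20 results is 6, which is the threshold the script compares against. The selection and threshold logic, condensed into a minimal sketch (check_sentence stands in for the Baidu lookup implemented further down and is only a placeholder here):

import re

def looks_original(content, check_sentence, max_matches=6):
    # split the article into sentences on Chinese/ASCII punctuation
    sentences = re.split(r'[,,??!!。::]', content)
    # sample up to 2 sentences longer than 12 characters
    samples = [s for s in sentences if len(s) > 12][:2]
    # check_sentence(s) should return how many of Baidu's top-10 results contain s verbatim
    matches = sum(check_sentence(s) for s in samples)
    return matches <= max_matches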

Note 1: Place a cookie.txt file in the same directory as this script. It can hold cookies from multiple logged-in Baidu accounts, which are used in rotation to avoid being blocked for crawling too frequently.
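
For reference, cookie.txt is read line by line, so each line should be one complete Cookie header string copied from a logged-in Baidu session, along these lines (the values below are placeholders):

BAIDUID=xxxxxxxxxxxx:FG=1; BDUSS=xxxxxxxxxxxx
BAIDUID=yyyyyyyyyyyy:FG=1; BDUSS=yyyyyyyyyyyy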

Note 2: Consider crawling through dynamic proxy IPs, which greatly reduces the chance of requests being blocked. I use Abuyun here; one IP tunnel costs 1 RMB per hour, so weigh for yourself whether you need it.
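
If you do enable the tunnel, a quick sanity check such as the sketch below (the credentials are placeholders, and httpbin.org is just a convenient echo service) confirms that requests really go out through the proxy before you start crawling:

import requests

proxy = "http://USERNAME:PASSWORD@http-dyn.abuyun.com:9020" # fill in your own tunnel credentials
proxies = {"http": proxy, "https": proxy}

# httpbin echoes back the IP it sees; it should change between requests when the dynamic tunnel works
print(requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10).text)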

#coding:utf-8

import requests, re, time, json
import multiprocessing
import pymysql as mdb
from random import choice

current_date = time.strftime('%Y-%m-%d',time.localtime(time.time()))
# pool of logged-in Baidu cookies, one per line in cookie.txt; a random one is picked for every request
cookie = [line.strip() for line in open('cookie.txt', encoding='utf-8') if line.strip()]

def search(req,html):
    # return the first capture group of req in html, or 'no' when there is no match
    text = re.search(req,html)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data

def date(timeStamp):
    # convert a unix timestamp to 'YYYY-MM-DD HH:MM:SS'
    timeArray = time.localtime(timeStamp)
    otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
    return otherStyleTime

def getHTml(url):

    host = search('^([^/]*?)/',re.sub(r'(https|http)://','',url)) # extract the host name for the Host header
    headers = {
        "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding":"gzip, deflate, sdch",
        "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
        "Cache-Control":"no-cache",
        "Connection":"keep-alive",
        "Cookie":choice(cookie),
        "Host":host,
        "Pragma":"no-cache",
        "Upgrade-Insecure-Requests":"1",
        "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    }

    # proxy server (Abuyun dynamic tunnel)
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"

    # proxy tunnel credentials
    proxyUser = "xxxxx"
    proxyPass = "xxxxx"

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
      "host" : proxyHost,
      "port" : proxyPort,
      "user" : proxyUser,
      "pass" : proxyPass,
    }

    proxies = {
        "http"  : proxyMeta,
        "https" : proxyMeta,
    }

    html = requests.get(url,headers=headers,timeout=30)
    # html = requests.get(url,headers=headers,timeout=30,proxies=proxies) # use this line instead to go through the proxy tunnel
    code = html.encoding
    status_code = html.status_code

    # print(status_code)
    # time.sleep(1) # slow down to avoid being rate-limited
    return html.content


def getContent(word):

    pcurl = 'http://www.baidu.com/s?q=&tn=json&ct=2097152&si=&ie=utf-8&cl=3&wd=%s&rn=10' % word
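    # tn=json&rn=10 makes the Baidu result page return JSON with the top 10 results (behaviour relied on by the original script, not an officially documented API)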
    print('start crawl %s' % word)
    html = getHTml(pcurl)

    a = 0
    try:
        html_dict = json.loads(html)
        for tag in html_dict['feed']['entry']:
            if 'title' in tag:
                title = tag['title']
                url = tag['url']
                rank = tag['pn']
                pub_time = date(tag['time'])
                abstract = tag['abs']
                # strip punctuation from the result abstract so that pseudo-original rewrites with shuffled punctuation still match the sentence
                abstract = re.sub(r'[,,??!!。::]', '', abstract)

                if word in abstract:
                    a += 1
    except (json.decoder.JSONDecodeError, KeyError):
        # Baidu returned something other than the expected JSON (e.g. a verification/block page); retry the query
        # note: there is no retry limit
        print('>>>> failed to parse the result page, retrying')
        a = getContent(word)
    return a


con = mdb.connect(host='127.0.0.1', user='root', password='root', database='seo', charset='utf8')
cur = con.cursor()
with con:
    cur.execute("select pid,duanluo from luke_caiji limit 30") # limit how many articles are checked
    numrows = int(cur.rowcount)
    for i in range(numrows):
        row = cur.fetchone()

        aid = row[0]     # article id (pid)
        content = row[1] # article body (duanluo), may contain HTML
        content_format = re.sub('<[^>]*?>','',content) # strip HTML tags before splitting into sentences

        a = 0
        list_sentence = re.split(r'[,,??!!。::]', content_format)
        for z in [ x for x in list_sentence if len(x)>12 ][:2]: # limit how many sentences are checked per article
            a += getContent(z)
        if a <= 6:
            print("%s --> %s [low duplication, worth using]\n" % (aid,a))
        else:
            print("%s --> %s\n" % (aid,a))


# Leftover sketch from the original script: batch-check keywords read from a file, in parallel
# (wordfile and client are not defined anywhere above)
# words = open(wordfile).readlines()
# pool = multiprocessing.Pool(processes=10)
# for word in words:
#     word = word.strip()
#     pool.apply_async(getContent, (word,client ))
# pool.close()
# pool.join()
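
If you want to use that batch mode, a minimal runnable sketch would look like this, assuming the sentences to check are stored one per line in a hypothetical words.txt and the database section above is wrapped so that it does not run when worker processes import the module:

if __name__ == '__main__':
    words = [w.strip() for w in open('words.txt', encoding='utf-8') if w.strip()]
    pool = multiprocessing.Pool(processes=10)
    results = pool.map(getContent, words) # one Baidu query per sentence, 10 workers
    pool.close()
    pool.join()
    for word, matches in zip(words, results):
        print('%s --> %s' % (word, matches))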

The code and the overall approach in this post follow Chuangge's (闯哥) article "SEO文章原创度检测" (SEO article originality check). His version was written in Python 2; I ported it to Python 3 and tweaked some of the parameters and the exception handling.