scrapy爬虫-考研英语词汇

之前都是用go语言爬虫的,学习了一点python,尝试着用scrapy框架爬虫

爬虫之前需要大致了解一下scrapy框架,官方文档实在看得累,找了一个中文翻译的文档简要了解了一下,地址是(http://www.scrapyd.cn/doc/165.html)

爬取的是扇贝网的英语词汇(https://www.shanbay.com/wordbook/34/), 虽然网页访问是要登录的,但是实际爬虫并不需要。

  • 1.扇贝网的文档结构十分统一
  • 2.右上角的搜索框,搜一个单词,很容易就通过调试获取到单词信息接口
  • 3.接口有次数限制,最好控制一下频率或者使用代理

于是,根据以上几点,很容易就能做到这个爬虫。而且由于结构的一致性,稍加修改,就能扩展到所有词汇的爬虫。

就英语考研词汇而言,内容有限,直接将最后的结果存储到json文件就可以了

具体代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# -*- coding: utf-8 -*-
import scrapy
import time
import json
import urllib3
import random


class WordbookSpider(scrapy.Spider):
    """Crawl one Shanbay wordbook and look every word up via the search API.

    The spider walks the wordbook index page, follows each word-list link
    page by page, and queries the search endpoint for every word it finds.
    Each successful result is appended to a temporary file right away; the
    sorted, complete result is written to ``self.file`` when the spider
    closes.

    Spider arguments (``scrapy crawl wordbook -a name=value``):
        book: wordbook id appended to the wordbook URL (default ``'34'``).
        file: output JSON path (default ``shanbei_wordbook_<book>.json``).
    """

    # NOTE: the lists below are class attributes, so they are shared by every
    # instance of this spider class.
    wl = []          # successfully fetched word entries (dicts)
    failedwds = []   # words whose lookup failed twice
    name = 'wordbook'
    allowed_domains = ['www.shanbay.com']
    book = '34'
    file = None
    tmp_fp = None    # append-mode handle for the incremental result file
    succ_fp = None   # append-mode handle for the list of finished words
    successws = []   # words to skip in parsewds
    proxys = [
        "http://124.207.82.166:8008"
    ]
    proxyManagers = []  # urllib3 pool managers built from ``proxys``

    def randproxy(self):
        """Return a randomly chosen urllib3 pool manager."""
        return random.choice(self.proxyManagers)

    def start_requests(self):
        """Open the output files, build the proxy pools, request the index page."""
        url = "http://www.shanbay.com/wordbook/"
        # Read the `book` value, i.e. the argument passed on the crawl command.
        book = getattr(self, 'book', None)
        if book is not None:
            self.book = book
        url = url + self.book  # build the wordbook URL

        file = getattr(self, 'file')
        if file is None:
            self.file = "shanbei_wordbook_" + self.book + ".json"
        else:
            self.file = file

        tmpfile = "shanbei_wordbook_" + self.book + "_tmp.json"
        self.tmp_fp = open(tmpfile, 'a', encoding='utf-8')

        successfile = "shanbei_wordbook_" + self.book + "_success.txt"
        self.succ_fp = open(successfile, 'a', encoding='utf-8')

        for proxy in self.proxys:
            try:
                if proxy == "":
                    # An empty entry means "connect directly, no proxy".
                    pool = urllib3.PoolManager(num_pools=5)
                else:
                    pool = urllib3.ProxyManager(proxy_url=proxy, num_pools=5)
            except Exception as e:
                print("can not conn:", proxy, e)
                continue
            self.proxyManagers.append(pool)

        # Without at least one pool, randproxy() would raise IndexError on the
        # first lookup; fall back to a direct connection.
        if not self.proxyManagers:
            self.proxyManagers.append(urllib3.PoolManager(num_pools=5))

        yield scrapy.Request(url, self.parse)  # request the wordbook index page

    def parse(self, response):
        """Follow every link inside the word-list container, starting at page 1."""
        containers = response.xpath('//*[@id="wordbook-wordlist-container"]')
        for container in containers:
            for href in container.css('a::attr(href)').extract():
                next_page = response.urljoin(href) + "?page=1"
                yield scrapy.Request(next_page, callback=self.parsewds)

    # To avoid losing everything to a single failure, results can be split
    # across files or appended to a temp file after each lookup, with the
    # format cleaned up at the end.
    def parsewds(self, response):
        """Look up every word on one word-list page, then request the next page."""
        ll = response.xpath('/html/body/div[3]/div/div[1]/div[2]/div/table')
        # The leading `//` makes this query document-wide rather than relative
        # to the table selected above.
        wl = ll.xpath("//td[@class='span2']/strong//text()").extract()
        for w in wl:
            if w not in self.successws:
                self.searchword(w)
                # NOTE(review): time.sleep and the synchronous urllib3 call
                # block this callback while they run.
                time.sleep(0.001)
        if len(wl) > 1:
            time.sleep(1)
            ss = response.url.split('page=')
            page = int(ss[-1])
            next_page = ss[0] + "page=" + str(page + 1)
            yield scrapy.Request(next_page, callback=self.parsewds)

    def _lookup(self, w):
        """Query the search API for ``w``; return its data dict or None on failure."""
        try:
            r = self.randproxy().request("GET", self.makesearchpath(w), retries=2)
            # json.loads() no longer accepts an `encoding` keyword on
            # Python 3.9+, so decode the body explicitly.
            resp = json.loads(r.data.decode("utf-8"))
        except (urllib3.exceptions.HTTPError, ValueError) as e:
            # Connection/proxy errors and undecodable or non-JSON bodies count
            # as a failed lookup instead of aborting the whole page callback.
            print("search request failed:", w, e)
            return None
        print(resp)
        if resp['status_code'] != 0:
            return None
        return resp['data']

    def searchword(self, w, first=True):
        """Fetch one word and record the result.

        Args:
            w: the word to look up.
            first: True on the initial attempt; a failed initial attempt is
                retried exactly once, a failed retry is added to
                ``failedwds``.
        """
        data = self._lookup(w)
        if data is not None:
            data['word'] = w
            self.succ_fp.write(w)
            self.succ_fp.write("\n")
            self.wl.append(data)
            json.dump(data, self.tmp_fp, ensure_ascii=False)
            self.tmp_fp.write(",\n")
        elif first:
            # Pass first=False so a persistent failure cannot recurse forever.
            self.searchword(w, first=False)
        else:
            self.failedwds.append(w)
            print("search failed:", w)

    def makesearchurl(self, w):
        """Return the host-relative search URL for ``w`` with a timestamp parameter."""
        from urllib.parse import quote

        tm = int(time.time() * 1000)
        # Percent-encode the word so spaces and non-ASCII characters produce a
        # valid query string; plain ASCII words are left unchanged.
        url = "/api/v1/bdc/search/?version=2&word={}&_={}".format(quote(w, safe=''), tm)
        return url

    def makesearchpath(self, w):
        """Return the absolute search URL for ``w``."""
        path = "http://www.shanbay.com" + self.makesearchurl(w)
        return path

    def close(self, spider, reason):
        """Write the sorted result file, report failures and release file handles."""
        self.wl.sort(key=lambda w: w['word'].lower())
        with open(self.file, 'w', encoding='utf-8') as fp:
            json.dump(self.wl, fp, ensure_ascii=False, indent=4)
        print("failed words", self.failedwds)
        # The incremental files stay open for the whole crawl; close them here
        # so buffered writes are flushed.
        for handle in (self.tmp_fp, self.succ_fp):
            if handle is not None:
                handle.close()
        super().close(spider, reason)

运行

1
# Run the "wordbook" spider; -a book=34 passes `book` to the spider as an argument
scrapy  crawl wordbook -a book=34

全部过程

1
2
3
4
5
6
7
8
9
10
#### Install the package
pip install scrapy
#### Create the project
scrapy startproject xxx
#### Generate the spider file
scrapy genspider -t basic wordbook https://www.shanbay.com/wordbook/
#### Write the code

#### Run
scrapy crawl wordbook -a book=34