有没有合适的PYTHON抓字典的案例可参考？

stiggg · 发表于 2020-9-13 15:19:03

懂点 Python，想学学前人是怎么爬字典的。目前只看到 BT4BAIDU 大大的开源代码，大大的代码很典范，可是代码已经是四五年前的了，网站似乎也改版了，具体细节的作用比较难揣摩。不知道还有没有其他比较好的案例可以参考学习？

毛小驴 · 发表于 2020-9-13 18:58:34

顶起

你去哪里 · 发表于 2020-9-13 22:19:26

我也想知道这个。

你去哪里 · 发表于 2020-9-13 22:19:38

我也想知道这个。

scirem · 发表于 2021-1-4 02:18:57

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import requests
import re
import time
import sqlite3

from string import ascii_lowercase
from pathlib import Path
from urllib.parse import urljoin
from html.parser import HTMLParser
from bs4 import BeautifulSoup

# Browse-index roots of several online dictionaries.  In the visible code only
# the two Cambridge URLs are referenced; the others are kept as-is.
webster_url = "https://www.merriam-webster.com/browse/dictionary/"
oxford_url = "https://www.oxfordlearnersdictionaries.com/browse/english/"
macmillan_url = "https://www.macmillandictionary.com/browse/british/"
cambridge_url = "https://dictionary.cambridge.org/browse/english/"
cambridge_english_chinese_url = "https://dictionary.cambridge.org/browse/english-chinese-simplified/"

# Direct-search endpoint and its query parameters (not referenced elsewhere in
# the visible code).
base_url = "https://dictionary.cambridge.org/search/direct/"
payload = {'datasetsearch': 'english-chinese-simplified', 'q': ""}

# HTTP headers sent with every request made by the scraper.
header = {"User-Agent": "Chrome/87.0.4280.88 Safari/537.36"}

# Module-wide SQLite connection: one row per downloaded dictionary entry.
conn = sqlite3.connect("Cambridge Advanced Learner's English-Chinese Dictionary.db")
conn.execute("""
CREATE TABLE IF NOT EXISTS cambridge (
    entry TEXT PRIMARY KEY
               UNIQUE
               NOT NULL,
    url   TEXT NOT NULL,
    html  TEXT NOT NULL,
    latex TEXT
);
""")
conn.commit()


# Characters that are special in LaTeX, mapped to the text that prints them
# literally.  Used with str.translate so every input character is rewritten
# exactly once (backslashes introduced by an escape are never re-escaped).
_LATEX_ESCAPES = str.maketrans({
    "%": r"\%",
    "\\": r"\textbackslash{}",
    "$": r"\$",
    "#": r"\#",
    "&": r"\&",
    "{": r"\{",
    "}": r"\}",
    # "\^" is LaTeX's circumflex *accent* command, not a literal caret.
    "^": r"\textasciicircum{}",
    "_": r"\_",
    "~": r"\textasciitilde{}",
})


def latexify(string):
    """Return *string* with whitespace tidied and LaTeX special characters escaped.

    Steps, in order:

    1. Every run of two or more whitespace characters becomes one space.
    2. Whitespace directly before a character that is not a word character,
       quote (``'`` or ``"``), ``(``, ``{`` or ``[`` is removed, e.g.
       ``"word ,"`` becomes ``"word,"``.
    3. ``% \\ $ # & { } ^ _ ~`` are replaced by their LaTeX escapes.

    Args:
        string: The plain text to convert.

    Returns:
        The escaped text as a new ``str``.
    """
    collapsed = re.sub(r"\s{2,}", " ", string)
    tightened = re.sub(r"\s+([^\w'\"({\[])", r"\1", collapsed)
    return tightened.translate(_LATEX_ESCAPES)


def latexify_html(beautifulsoup_object):
    """Return the LaTeX-escaped text of a parsed HTML node (or of a plain string).

    When the argument has a ``get_text()`` method, its text is taken, newlines
    are turned into spaces, runs of whitespace are collapsed, the result is
    stripped and then passed to ``latexify``.  An argument without
    ``get_text()`` (for example a ``str``) is passed to ``latexify`` unchanged.

    Args:
        beautifulsoup_object: An object exposing ``get_text()``, or a string.

    Returns:
        The escaped text as a ``str``.
    """
    try:
        raw_text = beautifulsoup_object.get_text()
    except AttributeError:
        # No get_text(): treat the argument as text already.
        return latexify(beautifulsoup_object)
    flattened = re.sub(r"\s{2,}", " ", raw_text.replace("\n", " "))
    return latexify(flattened.strip())


class CambridgeDictionaryScraper:
    """Scraper for Cambridge Dictionary.

    Walks the browse index starting from a root URL, downloads every word page
    that is not yet in the ``cambridge`` table and stores its HTML there.
    """

    # URLs already stored in the database; shared by all instances.
    url_set = set()

    # Seconds to wait for a server response before giving up on a request.
    request_timeout = 30

    def __init__(self):
        """Load the URLs that are already stored so they are not fetched again."""
        for (stored_url,) in conn.execute("SELECT url FROM cambridge;"):
            self.url_set.add(stored_url)
        print("Already Downloaded " + str(len(self.url_set)) + " Words!")

    def __del__(self):
        """Commit any pending database work when the scraper is discarded."""
        conn.commit()

    def _fetch_soup(self, url):
        """Download *url* and return it parsed with ``html.parser``.

        Raises:
            requests.RequestException: On connection failure, timeout or an
                HTTP error status.
        """
        response = requests.get(url, headers=header, timeout=self.request_timeout)
        # Fail loudly rather than parsing (and later storing) an error page.
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    def get_word_page(self, url):
        """Return the HTML of the dictionary entry found at *url*.

        The first match among ``div.entry``, ``div.di-body`` and
        ``div#entryContent`` (used by beta words) is taken; when none exists
        the whole document is used.  ``<script>`` elements are removed.

        Raises:
            requests.RequestException: If the page cannot be downloaded.
        """
        bs_obj = self._fetch_soup(url)
        entry_tag = None
        for attrs in ({"class": "entry"}, {"class": "di-body"}, {"id": "entryContent"}):
            entry_tag = bs_obj.find("div", attrs)
            if entry_tag:
                break
        if not entry_tag:
            entry_tag = bs_obj

        for script_tag in entry_tag.find_all("script"):
            script_tag.extract()
        return str(entry_tag)

    def start(self, url):
        """Crawl every ``li.lpr-10`` link listed on the index page *url*."""
        bs_obj = self._fetch_soup(url)
        for li_tag in bs_obj.select("li.lpr-10"):
            anchor = li_tag.a
            if anchor is None or not anchor.get("href"):
                continue
            # Resolve against the page that was fetched, not a fixed root,
            # so relative links work for any starting URL.
            child_url = urljoin(url, anchor.attrs["href"])
            print(child_url)
            self.find_child_entry(child_url)

    def find_child_entry(self, url):
        """Store every word linked from the listing page *url*.

        ``li.t-i`` items whose text contains ``"..."`` are followed
        recursively as further listing pages; all other items are treated as
        words.  A word whose URL is already in ``url_set`` is skipped.  A word
        that cannot be downloaded or inserted is reported and skipped, so it
        is attempted again on the next run.
        """
        bs_obj = self._fetch_soup(url)
        for li_tag in bs_obj.select("li.t-i"):
            anchor = li_tag.a
            if anchor is None or not anchor.get("href"):
                continue
            child_url = urljoin(url, anchor.attrs["href"]).strip()
            child_text = li_tag.get_text().strip()

            if "..." in child_text:
                self.find_child_entry(child_url)
                continue
            if child_url in self.url_set:
                continue

            print(child_text + "\t" + child_url)
            try:
                page_html = self.get_word_page(child_url)
            except requests.RequestException as error:
                print("Failed to download " + child_url + ": " + str(error))
                continue
            try:
                conn.execute(
                    "INSERT INTO cambridge (entry, url, html) VALUES (?, ?, ?);",
                    (child_text, child_url, page_html))
            except sqlite3.IntegrityError as error:
                # `entry` is the primary key: a second URL with the same text
                # must not abort the whole crawl.
                print("Skipped " + child_text + "\t" + child_url + ": " + str(error))
                continue
            conn.commit()
            self.url_set.add(child_url)


class CambridgeDictionaryExtractor():
    """Convert the stored HTML of one dictionary entry into LaTeX markup.

    Call ``extract()`` and then read ``result``.
    """

    def __init__(self, entry, entry_html = ""):
        """Remember the entry.

        Args:
            entry: The headword; it is LaTeX-escaped and kept in ``self.entry``.
            entry_html: The HTML of the entry page.
        """
        self.entry = latexify(entry)
        self.entry_html = entry_html
        self.result = ""

    def extract(self):
        """Append the LaTeX for the whole entry to ``self.result``.

        One ``Partofspeech`` environment is written per ``div.entry-body__el``
        and one ``idiom`` environment per idiom block nested as::

            <div class="pr idiom-block">
                <div class="idiom-block"></div>
            </div>
        """
        bs_obj = BeautifulSoup(self.entry_html, "html.parser")
        self.result += "\\begin{entry}{" + self.entry + "}"

        for part_of_speech_tag in bs_obj.find_all("div", {"class": "entry-body__el"}):
            self.result += ("\n\\begin{Partofspeech}\n"
                            + self.process_part_of_speech(part_of_speech_tag)
                            + "\n\\end{Partofspeech}")

        outer_idiom_block = bs_obj.find("div", {"class": "idiom-block"})
        if outer_idiom_block:
            for inner_idiom_block in outer_idiom_block.find_all("div", {"class": "idiom-block"}):
                self.result += ("\n\\begin{idiom}"
                                + self.process_idiom_block(inner_idiom_block)
                                + "\n\\end{idiom}")

        self.result += "\n\\end{entry}\n\n"

    def process_idiom_block(self, idiom_block):
        """Return the LaTeX for every ``div.dsense`` inside the idiom body."""
        idiom_body = idiom_block.find("span", {"class": "idiom-body"})
        if not idiom_body:
            return ""
        return "".join(self.process_sense_tag(sense_tag)
                       for sense_tag in idiom_body.find_all("div", {"class": "dsense"}))

    def get_smart_vocabulary(self, smart_vocabulary_tag):
        """Return a ``smartvocabulary`` environment with one ``\\smart`` per ``<li>``."""
        items = "".join("\\smart{" + latexify_html(li_tag) + "}\n"
                        for li_tag in smart_vocabulary_tag.find_all("li"))
        return "\n\\begin{smartvocabulary}\n" + items + "\\end{smartvocabulary}\n"

    def process_part_of_speech(self, part_of_speech_tag):
        """Return the LaTeX for one part-of-speech section.

        Expected structure::

            <div class="entry-body__el">
                <div class="pos-header"></div>
                <div class="pos-body"></div>
                <div class="pr relativDiv"></div>
            <div>

        The header, the body and an optional ``div.pv-block`` are processed in
        that order; the concatenation is returned stripped.
        """
        result = ""
        header_tag = part_of_speech_tag.find("div", {"class": "pos-header"})
        if header_tag:
            result += self.process_part_of_speech_header(header_tag)
        body_tag = part_of_speech_tag.find("div", {"class": "pos-body"})
        if body_tag:
            result += self.process_part_of_speech_body(body_tag)
        pv_block_tag = part_of_speech_tag.find("div", {"class": "pv-block"})
        if pv_block_tag:
            result += self.process_pv_block(pv_block_tag)
        return result.strip()

    def process_pv_block(self, tag):
        """Return the LaTeX for every ``div.sense-body`` inside a ``pv-block``.

        Expected structure::

            <div class="pv-block">
                <div class="di-title"></div>
                <span class="di-info"></span>
                <span class="pv-body dpv-body">
                    <div class="pr dsense dsense-noh">
                <span>
            <div>
        """
        # The attribute filter must be a dict (class -> value), not a set.
        return "".join(self.process_sense_body(item)
                       for item in tag.find_all("div", {"class": "sense-body"}))

    def process_part_of_speech_header(self, header_tag):
        """Return grammar and pronunciation markup for a ``pos-header``, stripped.

        The headword title (``div.di-title``) is not emitted here;
        ``process_header_title`` exists for it but is not called.
        """
        result = ""
        posgram_tag = header_tag.find("div", {"class": "posgram"})
        if posgram_tag:
            result += self.process_part_of_speech_grammar(posgram_tag)
        for pronunciation_tag in header_tag.find_all("span", {"class": "dpron-i"}):
            result += self.process_pronunciation(pronunciation_tag)
        return result.strip()

    def process_header_title(self, title_tag):
        """Return ``\\entry{...}`` for the headword.

        Uses the ``span.hw`` child (``<span class="hw dhw">record</span>``)
        when present, otherwise the text of *title_tag* itself.
        """
        headword_tag = title_tag.find("span", {"class": "hw"})
        source_tag = headword_tag if headword_tag else title_tag
        return "\\entry{" + latexify_html(source_tag) + "}\n"

    def process_part_of_speech_grammar(self, posgram_tag):
        """Return ``\\pos{...}`` and ``\\posgram{...}`` for the tags that exist."""
        result = ""
        part_of_speech_tag = posgram_tag.find("span", {"class": "pos"})
        if part_of_speech_tag:
            result += "\n\\pos{" + latexify_html(part_of_speech_tag) + "}"
        gram_tag = posgram_tag.find("span", {"class": "gc"})
        if gram_tag:
            result += "\n\\posgram{" + latexify_html(gram_tag) + "}"
        return result

    def process_pronunciation(self, pronunciation_tag):
        """Return IPA and audio markup for one ``span.dpron-i``.

        The IPA is written as ``\\ipaus`` when the tag carries the CSS class
        ``us`` and as ``\\ipauk`` otherwise.  The MPEG audio source, when
        present, is written as an absolute URL.
        """
        is_us_pronunciation = "us" in pronunciation_tag.attrs.get("class", [])
        result = ""

        ipa_tag = pronunciation_tag.find("span", {"class": "ipa"})
        if ipa_tag:
            macro = "\n\\ipaus{" if is_us_pronunciation else "\n\\ipauk{"
            result += macro + latexify_html(ipa_tag) + "}"

        audio_tag = pronunciation_tag.find("source", {"type": "audio/mpeg"})
        if audio_tag:
            audio_url = urljoin("https://dictionary.cambridge.org/", audio_tag.attrs["src"])
            # NOTE(review): the macro name is spelled "pronuniation"; it is
            # emitted unchanged because the LaTeX side is not visible here.
            result += "\n\\pronuniation{" + audio_url + "}"
        return result

    @staticmethod
    def _bracketed_text(text):
        """Return the text between the first ``(`` and the next ``)``, stripped.

        Without ``(`` the text is taken from its start; without a following
        ``)`` it is taken to its end.
        """
        start = text.find("(") + 1  # find() returns -1 when absent -> start 0
        end = text.find(")", start)
        if end == -1:
            end = len(text)
        return text[start:end].strip()

    def process_sense_head(self, tag):
        """Return ``\\shortmeaning{...}`` holding the bracketed part of the heading."""
        return "\n\\shortmeaning{" + self._bracketed_text(latexify_html(tag)) + "}"

    def get_definition(self, tag):
        """Return the definition text: ``span.def-info`` parts, then ``div.def`` parts.

        Empty parts are dropped and the rest are joined with single spaces.
        """
        pieces = [latexify_html(def_info_tag) for def_info_tag in tag.select("span.def-info")]
        pieces += [latexify_html(def_tag) for def_tag in tag.select("div.def")]
        return " ".join(piece for piece in pieces if piece)

    def process_def_block(self, tag):
        """Return definition, translation and examples for one ``def-block``.

        Emits ``\\meaningen`` from ``div.ddef_h``, ``\\meaningcn`` from the
        first ``span.trans`` and one ``\\example`` per ``span.eg``; parts that
        are missing are simply omitted.
        """
        result = ""
        def_tag = tag.find("div", {"class": "ddef_h"})
        if def_tag:
            result += "\n\\meaningen{" + self.get_definition(def_tag) + "}"

        translation_tags = tag.select("span.trans")
        if translation_tags:
            result += "\n\\meaningcn{" + latexify_html(translation_tags[0]) + "}"

        for example_tag in tag.select("span.eg"):
            result += "\n\\example{" + latexify_html(example_tag) + "}"
        return result

    def process_phrase_block(self, phrase_block_tag):
        """Return a ``phrase`` environment titled with the first ``span.phrase-title``.

        Expected structure::

            <div class="phrase-head dphrase_h">...</div>
            <div class="phrase-body dphrase_b">...</div>
            <div class="bb hax">...</div>

        The title is empty when no ``span.phrase-title`` exists.
        """
        title_tags = phrase_block_tag.select("span.phrase-title")
        title = latexify_html(title_tags[0]) if title_tags else ""
        return "\\begin{phrase}{" + title + "}" + "\\end{phrase}\n"

    def process_sense_body(self, tag):
        """Return the LaTeX for the def-blocks, then the phrase-blocks, of a sense body.

        Expected structure::

            <div class="pr phrase-block dphrase-block">...</div>
            <div class="def-block ddef_block">...</div>
            <div class="bb hax">...</div>
        """
        result = ""
        for def_block in tag.select("div.def-block"):
            result += self.process_def_block(def_block)
        for phrase_block in tag.select("div.phrase-block"):
            result += self.process_phrase_block(phrase_block)
        return result

    def process_sense_tag(self, sense_tag):
        """Return the LaTeX for one sense: heading, bodies, smart vocabulary.

        Expected structure::

            <h3 class="dsense_h">...</h3>
            <div class="sense-body dsense_b">...</div>
            <div class="smartt daccord">...</div>   (smart vocabulary)
            <div class="bb hax">...</div>
        """
        result = ""
        sense_head_tag = sense_tag.find("h3", {"class": "dsense_h"})
        if sense_head_tag:
            result += self.process_sense_head(sense_head_tag)
        for sense_body_tag in sense_tag.select("div.sense-body"):
            result += self.process_sense_body(sense_body_tag)
        for smart_vocabulary_tag in sense_tag.find_all("div", {"class": "smartt"}):
            result += self.get_smart_vocabulary(smart_vocabulary_tag)
        return result

    def process_part_of_speech_body(self, body_tag):
        """Return the LaTeX for every ``div.dsense`` inside a ``pos-body``.

        Expected structure::

            <div class="pr dsense">...</div>
            <div class="pr dsense">...</div>
        """
        return "".join(self.process_sense_tag(sense_tag)
                       for sense_tag in body_tag.select("div.dsense"))


def main(output_path="./final.tex"):
    """Scrape the English-Chinese dictionary and write all entries as LaTeX.

    First runs the scraper from ``cambridge_english_chinese_url``, then
    converts every row of the ``cambridge`` table with
    ``CambridgeDictionaryExtractor`` and writes the concatenated result to
    *output_path* as UTF-8.

    Args:
        output_path: Destination file; defaults to ``./final.tex``.
    """
    CambridgeDictionaryScraper().start(cambridge_english_chinese_url)

    rendered_entries = []
    for entry, _url, html in conn.execute("SELECT entry, url, html FROM cambridge;"):
        print(entry)
        record = CambridgeDictionaryExtractor(entry, entry_html=html)
        record.extract()
        rendered_entries.append(record.result + "\n")
    document = "".join(rendered_entries)

    # Put a \section heading in front of the first entry that starts with
    # each lowercase letter.
    for char in ascii_lowercase:
        entry_start = "\\begin{entry}{" + char
        document = document.replace(
            entry_start, "\\section{" + char + "}\n\n" + entry_start, 1)

    # NOTE(review): the original wrote `string.replace("", "")`, which changes
    # nothing; if a special character was meant to be stripped there it was
    # presumably lost when the code was pasted - confirm with the author.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(document)


if __name__ == "__main__":
    main()


复制代码

		自动登录	找回密码
密码			免费注册

[求助] 有没有合适的PYTHON抓字典的案例可参考？