有没有合适的PYTHON抓字典的案例可参考？

stiggg · 发表于 2020-9-13 15:19:03

懂点PYTHON，想学学前人是怎么爬字典的。目前只看到BT4BAIDU大大的开源代码，大大的代码很典范，可是代码已经是四五年前的了，网站似乎也已改版，具体细节的作用比较难揣摩。不知道还有没有其它比较好的案例可以参考学习？

毛小驴 · 发表于 2020-9-13 18:58:34

顶起

你去哪里 · 发表于 2020-9-13 22:19:26

我也想知道这个。

你去哪里 · 发表于 2020-9-13 22:19:38

我也想知道这个。

scirem · 发表于 2021-1-4 02:18:57

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Crawl Cambridge Dictionary browse pages into SQLite and export LaTeX.

Module-level setup: imports, dictionary URL constants, the shared HTTP
header, and the SQLite connection/table used by the rest of the script.
"""
import os
import re
import sqlite3
import time
from html.parser import HTMLParser
from pathlib import Path
from string import ascii_lowercase
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Browse-index roots of several online dictionaries.  Only the two Cambridge
# URLs are referenced by the code visible in this file.
webster_url = "https://www.merriam-webster.com/browse/dictionary/"
oxford_url = "https://www.oxfordlearnersdictionaries.com/browse/english/"
macmillan_url = "https://www.macmillandictionary.com/browse/british/"
cambridge_url = "https://dictionary.cambridge.org/browse/english/"
cambridge_english_chinese_url = "https://dictionary.cambridge.org/browse/english-chinese-simplified/"

# Direct-search endpoint and its query payload (not referenced by the code
# visible in this file).
base_url = "https://dictionary.cambridge.org/search/direct/"
payload = {'datasetsearch': 'english-chinese-simplified', 'q': ""}

# Header sent with every HTTP request made through ``requests``.
header = {"User-Agent": "Chrome/87.0.4280.88 Safari/537.36"}

# Single module-wide connection; the database file is created in the current
# working directory on first run.
conn = sqlite3.connect("Cambridge Advanced Learner's English-Chinese Dictionary.db")

# One row per dictionary entry: the entry text is the primary key, ``html``
# holds the downloaded page fragment, ``latex`` is optional.
conn.execute("""
CREATE TABLE IF NOT EXISTS cambridge (
    entry TEXT PRIMARY KEY
               UNIQUE
               NOT NULL,
    url   TEXT NOT NULL,
    html  TEXT NOT NULL,
    latex TEXT
);
""")
conn.commit()
# LaTeX reserved characters mapped to their escaped text-mode form.
# ``\^`` on its own is an accent command, so the caret uses
# ``\textasciicircum{}`` just like the tilde uses ``\textasciitilde{}``.
_LATEX_ESCAPES = str.maketrans({
    "%": r"\%",
    "\\": r"\textbackslash{}",
    "$": r"\$",
    "#": r"\#",
    "&": r"\&",
    "{": r"\{",
    "}": r"\}",
    "^": r"\textasciicircum{}",
    "_": r"\_",
    "~": r"\textasciitilde{}",
})


def latexify(string):
    """Return *string* normalised and escaped for use in LaTeX text.

    Steps, in order:

    1. every run of two or more whitespace characters becomes one space;
    2. whitespace is removed when it precedes a character that is not a
       word character, a quote (``'`` or ``"``) or an opening bracket
       (``(``, ``{``, ``[``) -- e.g. ``"word ,"`` becomes ``"word,"``;
    3. each LaTeX reserved character is replaced by its escape sequence.

    Escaping happens in a single pass over the characters, so the
    backslashes and braces introduced by one replacement are never
    escaped again.
    """
    collapsed = re.sub(r"\s{2,}", " ", string)
    tightened = re.sub(r"""\s+([^\w'"({\[])""", r"\1", collapsed)
    return tightened.translate(_LATEX_ESCAPES)
def latexify_html(beautifulsoup_object):
    """Return the LaTeX-escaped visible text of a parsed HTML node.

    The node's text is flattened onto one line (newlines become spaces,
    whitespace runs collapse, ends are stripped) and passed to
    ``latexify``.  An argument without a ``get_text`` method (for example
    a plain ``str``) is passed to ``latexify`` unchanged.
    """
    try:
        text = beautifulsoup_object.get_text()
    except AttributeError:
        # Not a parsed node: treat the argument itself as the text.  Only
        # the attribute lookup is guarded so that genuine errors raised by
        # latexify() are not silently retried.
        return latexify(beautifulsoup_object)
    flattened = re.sub(r"\s{2,}", " ", text.replace("\n", " ")).strip()
    return latexify(flattened)
class CambridgeDictionaryScraper:
    """Scraper for Cambridge Dictionary.

    Walks the browse index starting at a given URL, downloads every word
    page that is not yet stored and inserts it into the ``cambridge`` table
    through the module-level ``conn`` connection.
    """

    # URLs of word pages already stored.  Shared by all instances.
    url_set = set()

    # Seconds to wait for a single HTTP response; without a timeout a
    # stalled connection would block the crawl forever.
    request_timeout = 30

    def __init__(self):
        """Load the URLs already present in the database into ``url_set``."""
        for (stored_url,) in conn.execute("SELECT url FROM cambridge;"):
            self.url_set.add(stored_url)
        print("Already Downloaded " + str(len(self.url_set)) + " Words!")

    def __del__(self):
        """Commit the shared connection when the scraper is destroyed."""
        conn.commit()

    def _fetch_soup(self, url):
        """Download *url* and return it parsed with ``html.parser``."""
        response = requests.get(url, headers=header, timeout=self.request_timeout)
        return BeautifulSoup(response.text, "html.parser")

    def get_word_page(self, url):
        """Return the HTML of the entry section of the word page at *url*.

        The first match among ``div.entry``, ``div.di-body`` and
        ``div#entryContent`` (used by beta words) is selected; when none
        exists the whole document is used.  ``<script>`` elements are
        removed before the fragment is serialised to a string.
        """
        bs_obj = self._fetch_soup(url)
        entry_tag = (
            bs_obj.find("div", {"class": "entry"})
            or bs_obj.find("div", {"class": "di-body"})
            or bs_obj.find("div", {"id": "entryContent"})  # Beta words
            or bs_obj
        )
        for script_tag in entry_tag.find_all("script"):
            script_tag.extract()
        return str(entry_tag)

    def start(self, url):
        """Crawl the browse page at *url* and every ``li.lpr-10`` link on it."""
        bs_obj = self._fetch_soup(url)
        for li_tag in bs_obj.select("li.lpr-10"):
            anchor = li_tag.a
            if anchor is None or not anchor.get("href"):
                continue  # list item without a usable link
            # Resolve relative links against the page being crawled, the
            # same way find_child_entry() does.
            child_url = urljoin(url, anchor["href"])
            print(child_url)
            self.find_child_entry(child_url)

    def find_child_entry(self, url):
        """Process every ``li.t-i`` item of the index page at *url*.

        Items whose text contains ``"..."`` are treated as further index
        pages and crawled recursively.  Any other item is a word: unless
        its URL is already in ``url_set`` the word page is downloaded,
        inserted into the ``cambridge`` table and committed immediately.
        """
        bs_obj = self._fetch_soup(url)
        for li_tag in bs_obj.select("li.t-i"):
            anchor = li_tag.a
            if anchor is None or not anchor.get("href"):
                continue  # list item without a usable link
            child_url = urljoin(url, anchor["href"]).strip()
            child_text = li_tag.get_text().strip()

            if "..." in child_text:
                self.find_child_entry(child_url)
                continue
            if child_url in self.url_set:
                continue

            print(child_text + "\t" + child_url)
            page_html = self.get_word_page(child_url)
            try:
                conn.execute("INSERT INTO cambridge (entry, url, html) VALUES (?, ?, ?);",
                             (child_text, child_url, page_html))
            except sqlite3.IntegrityError:
                # ``entry`` is the primary key while de-duplication is done
                # by URL, so the same entry text reached through a second
                # URL would otherwise abort the whole crawl.
                print("Skipping duplicate entry: " + child_text)
                continue
            conn.commit()
            self.url_set.add(child_url)
class CambridgeDictionaryExtractor():
    """Convert the stored HTML of one dictionary entry into LaTeX markup.

    ``entry`` is the headword text (LaTeX-escaped on construction) and
    ``entry_html`` the HTML fragment to convert.  ``extract()`` appends the
    generated LaTeX to ``self.result``.
    """

    def __init__(self, entry, entry_html=""):
        self.entry = latexify(entry)
        self.entry_html = entry_html
        self.result = ""

    def extract(self):
        """Parse ``entry_html`` and append one ``entry`` environment to ``result``.

        Every ``div.entry-body__el`` becomes a ``Partofspeech`` environment.
        Idioms are expected in this shape:

        <div class="pr idiom-block">
            <div class="idiom-block"></div>
        </div>

        Each inner block becomes an ``idiom`` environment.  The output is
        appended, so calling this method twice duplicates the entry.
        """
        bs_obj = BeautifulSoup(self.entry_html, "html.parser")
        self.result += "\\begin{entry}{" + self.entry + "}"
        for part_of_speech_tag in bs_obj.find_all("div", {"class": "entry-body__el"}):
            self.result += ("\n\\begin{Partofspeech}\n"
                            + self.process_part_of_speech(part_of_speech_tag)
                            + "\n\\end{Partofspeech}")
        idiom_wrapper = bs_obj.find("div", {"class": "idiom-block"})
        if idiom_wrapper:
            for idiom_block in idiom_wrapper.find_all("div", {"class": "idiom-block"}):
                self.result += ("\n\\begin{idiom}"
                                + self.process_idiom_block(idiom_block)
                                + "\n\\end{idiom}")
        self.result += "\n\\end{entry}\n\n"

    def process_idiom_block(self, idiom_block):
        """Return the LaTeX for every ``div.dsense`` inside ``span.idiom-body``."""
        result = ""
        idiom_body = idiom_block.find("span", {"class": "idiom-body"})
        if idiom_body:
            for sense_tag in idiom_body.find_all("div", {"class": "dsense"}):
                result += self.process_sense_tag(sense_tag)
        return result

    def get_smart_vocabulary(self, smart_vocabulary_tag):
        """Return a ``smartvocabulary`` environment with one ``\\smart`` per ``<li>``."""
        result = ""
        for li_tag in smart_vocabulary_tag.find_all("li"):
            result += "\\smart{" + latexify_html(li_tag) + "}\n"
        return "\n\\begin{smartvocabulary}\n" + result + "\\end{smartvocabulary}\n"

    def process_part_of_speech(self, part_of_speech_tag):
        """Return the LaTeX for one part-of-speech section.

        <div class="entry-body__el">
            <div class="pos-header"></div>
            <div class="pos-body"></div>
            <div class="pr relativDiv"></div>
        <div>

        The header, the body and a ``div.pv-block`` are each processed
        when present; the combined text is stripped.
        """
        result = ""
        header_tag = part_of_speech_tag.find("div", {"class": "pos-header"})
        if header_tag:
            result += self.process_part_of_speech_header(header_tag)
        body_tag = part_of_speech_tag.find("div", {"class": "pos-body"})
        if body_tag:
            result += self.process_part_of_speech_body(body_tag)
        pv_block_tag = part_of_speech_tag.find("div", {"class": "pv-block"})
        if pv_block_tag:
            result += self.process_pv_block(pv_block_tag)
        return result.strip()

    def process_pv_block(self, tag):
        """Return the LaTeX for every ``div.sense-body`` in a ``pv-block``.

        <div class="pv-block">
            <div class="di-title"></div>
            <span clss="di-info"></span>
            <span class="pv-body dpv-body">
                <div class="pr dsense dsense-noh">
            <span>
        <div>
        """
        result = ""
        # The attribute filter must be a dict; the original passed the set
        # {"class", "sense-body"} by mistake.
        for item in tag.find_all("div", {"class": "sense-body"}):
            result += self.process_sense_body(item)
        return result

    def process_part_of_speech_header(self, header_tag):
        """Return grammar and pronunciation LaTeX for a ``pos-header`` tag.

        The headword title is not emitted here; ``process_header_title``
        exists for that purpose but is not called by this method.
        """
        result = ""
        posgram_tag = header_tag.find("div", {"class": "posgram"})
        if posgram_tag:
            result += self.process_part_of_speech_grammar(posgram_tag)
        for pronunciation_tag in header_tag.find_all("span", {"class": "dpron-i"}):
            result += self.process_pronunciation(pronunciation_tag)
        return result.strip()

    def process_header_title(self, title_tag):
        """Return an ``\\entry{...}`` line for the headword.

        Uses the ``span.hw`` child (e.g. ``<span class="hw dhw">record</span>``)
        when present, otherwise the text of *title_tag* itself.
        """
        headword_tag = title_tag.find("span", {"class": "hw"})
        source_tag = headword_tag if headword_tag else title_tag
        return "\\entry{" + latexify_html(source_tag) + "}\n"

    def process_part_of_speech_grammar(self, posgram_tag):
        """Return ``\\pos{...}`` and ``\\posgram{...}`` for ``span.pos`` / ``span.gc``."""
        result = ""
        part_of_speech_tag = posgram_tag.find("span", {"class": "pos"})
        if part_of_speech_tag:
            result += "\n\\pos{" + latexify_html(part_of_speech_tag) + "}"
        gram_tag = posgram_tag.find("span", {"class": "gc"})
        if gram_tag:
            result += "\n\\posgram{" + latexify_html(gram_tag) + "}"
        return result

    def process_pronunciation(self, pronunciation_tag):
        """Return IPA and audio-URL LaTeX for one pronunciation tag.

        A tag carrying the CSS class ``us`` yields ``\\ipaus``, any other
        tag ``\\ipauk``.  The first ``audio/mpeg`` source, if any, is
        emitted as an absolute URL.
        """
        is_us_pronunciation = "us" in pronunciation_tag.get("class", [])
        result = ""
        audio_tag = pronunciation_tag.find("source", {"type": "audio/mpeg"})
        ipa_tag = pronunciation_tag.find("span", {"class": "ipa"})
        if ipa_tag:
            macro = "ipaus" if is_us_pronunciation else "ipauk"
            result += "\n\\" + macro + "{" + latexify_html(ipa_tag) + "}"
        if audio_tag:
            audio_url = urljoin("https://dictionary.cambridge.org/", audio_tag.attrs["src"])
            # NOTE(review): the macro name is spelled "pronuniation" in the
            # original output; kept as-is because the LaTeX preamble that
            # defines it is not visible here -- confirm before renaming.
            result += "\n\\pronuniation{" + audio_url + "}"
        return result

    def process_sense_head(self, tag):
        """Return ``\\shortmeaning{...}`` from the text of a sense heading.

        The text between the first "(" and the next ")" is used.  When
        there is no "(" the text is taken from its start, and when there is
        no ")" it runs to the end.
        """
        text = latexify_html(tag)
        # find() returns -1 when "(" is absent, so start becomes 0 and the
        # first character is kept (the original always skipped one).
        start = text.find("(") + 1
        end = text.find(")", start)
        if end == -1:
            end = len(text)
        return "\n\\shortmeaning{" + text[start:end].strip() + "}"

    def get_definition(self, tag):
        """Return the definition text: ``span.def-info`` parts, then ``div.def`` parts.

        Non-empty parts are joined with a single space.
        """
        parts = [latexify_html(info_tag) for info_tag in tag.select("span.def-info")]
        parts += [latexify_html(def_tag) for def_tag in tag.select("div.def")]
        return " ".join(part for part in parts if part)

    def process_def_block(self, tag):
        """Return English meaning, Chinese meaning and examples of a def-block.

        ``div.ddef_h`` yields ``\\meaningen``, the first ``span.trans``
        yields ``\\meaningcn`` and every ``span.eg`` yields ``\\example``.
        """
        result = ""
        def_tag = tag.find("div", {"class": "ddef_h"})
        if def_tag:
            result += "\n\\meaningen{" + self.get_definition(def_tag) + "}"
        translation_tags = tag.select("span.trans")
        if translation_tags:
            result += "\n\\meaningcn{" + latexify_html(translation_tags[0]) + "}"
        for example_tag in tag.select("span.eg"):
            result += "\n\\example{" + latexify_html(example_tag) + "}"
        return result

    def process_phrase_block(self, phrase_block_tag):
        """Return a ``phrase`` environment holding the phrase title.

        <div class="phrase-head dphrase_h">...</div>
        <div class="phrase-body dphrase_b">...</div>
        <div class="bb hax">...</div>

        Returns an empty string when the block has no ``span.phrase-title``.
        """
        title_tags = phrase_block_tag.select("span.phrase-title")
        if not title_tags:
            return ""
        # The title is LaTeX-escaped like every other extracted text, and
        # the environment is closed with the same name it was opened with.
        return "\\begin{phrase}{" + latexify_html(title_tags[0]) + "}" + "\\end{phrase}\n"

    def process_sense_body(self, tag):
        """Return the LaTeX for the def-blocks and phrase-blocks of a sense body.

        <div class="pr phrase-block dphrase-block">...</div>
        <div class="def-block ddef_block">...</div>
        <div class="bb hax">...</div>
        """
        result = ""
        for def_block in tag.select("div.def-block"):
            result += self.process_def_block(def_block)
        for phrase_block in tag.select("div.phrase-block"):
            result += self.process_phrase_block(phrase_block)
        return result

    def process_sense_tag(self, sense_tag):
        """Return the LaTeX for one sense: heading, bodies, smart vocabulary.

        <h3 class="dsense_h">...</h3>
        <div class="sense-body dsense_b">...</div>
        <div class="smartt daccord">...</div>    # Smart Vocabulary
        <div class="bb hax">...</div>
        """
        result = ""
        sense_head_tag = sense_tag.find("h3", {"class": "dsense_h"})
        if sense_head_tag:
            result += self.process_sense_head(sense_head_tag)
        for sense_body_tag in sense_tag.select("div.sense-body"):
            result += self.process_sense_body(sense_body_tag)
        for smart_vocabulary_tag in sense_tag.find_all("div", {"class": "smartt"}):
            result += self.get_smart_vocabulary(smart_vocabulary_tag)
        return result

    def process_part_of_speech_body(self, body_tag):
        """Return the LaTeX for every ``div.dsense`` in a ``pos-body``.

        <div class="pr dsense">...</div>
        <div class="pr dsense">...</div>
        """
        result = ""
        for sense_tag in body_tag.select("div.dsense"):
            result += self.process_sense_tag(sense_tag)
        return result
def main():
    """Crawl the English-Chinese index, then export all entries to LaTeX.

    After the crawl, every row of the ``cambridge`` table is converted with
    ``CambridgeDictionaryExtractor``.  A ``\\section{x}`` heading is inserted
    before the first entry beginning with each lowercase ASCII letter, and
    the document is written to ``./final.tex`` as UTF-8.
    """
    CambridgeDictionaryScraper().start(cambridge_english_chinese_url)

    pieces = []
    for entry, _url, html in conn.execute("SELECT entry, url, html FROM cambridge;"):
        print(entry)
        record = CambridgeDictionaryExtractor(entry, entry_html=html)
        record.extract()
        pieces.append(record.result + "\n")
    document = "".join(pieces)

    for char in ascii_lowercase:
        # count=1: only the first entry starting with this letter gets a
        # section heading in front of it.
        document = document.replace("\\begin{entry}{" + char,
                                    "\\section{" + char + "}\n\n\\begin{entry}{" + char,
                                    1)

    # NOTE(review): the pasted source wrote ``string.replace("", "")``, which
    # is a no-op; presumably a character was lost from the first argument
    # when the code was posted -- confirm whether something should be
    # stripped here.
    # The file is opened only once the document is complete, so a failure
    # above does not leave a truncated final.tex behind.
    with open("./final.tex", "w", encoding="utf-8") as f:
        f.write(document)


if __name__ == "__main__":
    main()

复制代码

		自动登录	找回密码
密码			免费注册

[求助] 有没有合适的PYTHON抓字典的案例可参考？