Posted on 2021-1-4 02:18:57

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import requests
import re
import time
import sqlite3
from string import ascii_lowercase
from pathlib import Path
from urllib.parse import urljoin
from html.parser import HTMLParser
from bs4 import BeautifulSoup

webster_url = "https://www.merriam-webster.com/browse/dictionary/"
oxford_url = "https://www.oxfordlearnersdictionaries.com/browse/english/"
macmillan_url = "https://www.macmillandictionary.com/browse/british/"
cambridge_url = "https://dictionary.cambridge.org/browse/english/"
cambridge_english_chinese_url = "https://dictionary.cambridge.org/browse/english-chinese-simplified/"

base_url = "https://dictionary.cambridge.org/search/direct/"
header = {"User-Agent": "Chrome/87.0.4280.88 Safari/537.36"}
payload = {'datasetsearch': 'english-chinese-simplified', 'q': ""}
conn = sqlite3.connect("Cambridge Advanced Learner's English-Chinese Dictionary.db")

conn.execute("""
CREATE TABLE IF NOT EXISTS cambridge (
    entry TEXT PRIMARY KEY
               UNIQUE
               NOT NULL,
    url   TEXT NOT NULL,
    html  TEXT NOT NULL,
    latex TEXT
);
""")

conn.commit()


def latexify(string):
    result = ""
    trimmed_string = re.sub(r"\s{2,}", " ", string)
    # Drop stray whitespace before punctuation, then escape LaTeX special characters.
    for char in re.sub(r"""\s+([^\w'"({[])""", r"\1", trimmed_string):
        if char == "%":
            result += r"\%"
        elif char == "\\":
            result += r"\textbackslash{}"
        elif char == "$":
            result += r"\$"
        elif char == "#":
            result += r"\#"
        elif char == "&":
            result += r"\&"
        elif char == "{":
            result += r"\{"
        elif char == "}":
            result += r"\}"
        elif char == "^":
            result += r"\^{}"
        elif char == "_":
            result += r"\_"
        elif char == "~":
            result += r"\textasciitilde{}"
        else:
            result += char
    return result


def latexify_html(beautifulsoup_object):
    try:
        return latexify(re.sub(r"\s{2,}", " ", beautifulsoup_object.get_text().replace("\n", " ")).strip())
    except AttributeError:
        # Not a BeautifulSoup tag; treat it as a plain string.
        return latexify(beautifulsoup_object)


class CambridgeDictionaryScraper:
    """Scraper for Cambridge Dictionary."""
    url_set = set()  # shared by all instances

    def __init__(self):
        for item in conn.execute("SELECT url FROM cambridge;"):
            self.url_set.add(item[0])
        print("Already Downloaded " + str(len(self.url_set)) + " Words!")

    def __del__(self):
        conn.commit()

    def get_word_page(self, url):
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        entry_tag = bs_obj.find("div", {"class": "entry"})
        if not entry_tag:
            entry_tag = bs_obj.find("div", {"class": "di-body"})
        if not entry_tag:
            # Beta words
            entry_tag = bs_obj.find("div", {"id": "entryContent"})
        if not entry_tag:
            entry_tag = bs_obj

        for tag in entry_tag.find_all("script"):
            tag.extract()

        result_string = str(entry_tag)
        return result_string

    def start(self, url):
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")

        for li_tag in bs_obj.select("li.lpr-10"):
            child_url = urljoin(cambridge_url, li_tag.a.attrs["href"])
            print(child_url)
            self.find_child_entry(child_url)

    def find_child_entry(self, url):
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        for li_tag in bs_obj.select("li.t-i"):
            child_url = urljoin(url, li_tag.a.attrs["href"]).strip()
            child_text = li_tag.get_text().strip()
            if "..." in child_text:
                # "..." marks an index page that splits into further sub-pages.
                self.find_child_entry(child_url)
            else:
                if child_url in self.url_set:
                    continue
                print(child_text + "\t" + child_url)
                conn.execute("INSERT INTO cambridge (entry, url, html) VALUES (?, ?, ?);",
                             (child_text, child_url, self.get_word_page(child_url)))
                conn.commit()
                self.url_set.add(child_url)

" Z X, s7 h7 t% \- l- class CambridgeDictionaryExtractor():
7 p% h c0 v* ?1 S6 I" y- v& w& _ - def __init__(self, entry, entry_html = ""):
- x7 t' R, R) p" O# I8 _ i - self.entry = latexify(entry)
+ o8 V+ w6 W6 w - self.entry_html = entry_html/ Q4 H4 m" Q- y6 {+ l6 t/ v
- self.result = ""
4 U. ^% _) j6 a; k: \4 f; n
: y; f2 F- s' }" W; g3 o- # def __del__(self):
; {5 b! f) l# ]5 c - # pass
$ T4 M9 y; m! y: G7 T6 ] - 8 X" j1 e( A# {
C; ]* [+ F+ \- def extract(self):; Z4 i$ g% u" z, ^8 M
- """
& x& n3 b: k& O+ M2 j4 U4 a - <div class="pr idiom-block">
; V. X) r" b& m- c9 i, w6 K - <div class="idiom-block"></div>
: M1 \- A L x0 }# v - </div>
# H* p, l( t- d* X* k6 h7 @. I - """: V5 u; H3 G( p/ w' K# U3 ?# t) L0 q
- bs_obj = BeautifulSoup(self.entry_html, "html.parser")
6 K# m. j$ Y" R6 r: D/ p - self.result += "\\begin{entry}{" + self.entry + "}"3 @( r$ x+ D1 d3 _2 X8 T. I
- for part_of_speech_tag in bs_obj.find_all("div", {"class": "entry-body__el"}):6 M& _6 o* T4 J$ K1 G* x# t$ I' k
- self.result += "\n\\begin{Partofspeech}\n" + self.process_part_of_speech(part_of_speech_tag) + "\n\\end{Partofspeech}") j6 H% K& W6 W0 `) I
- idiom_block = bs_obj.find("div", {"class": "idiom-block"})
+ |( U- P& u7 X - if idiom_block:( F k+ s2 q/ i# V
- for idiom_block in idiom_block.find_all("div", {"class": "idiom-block"}):
& M# w: R; c+ ?. `) c - self.result += "\n\\begin{idiom}" + self.process_idiom_block(idiom_block) + "\n\\end{idiom}"/ S+ S; w( ?- S8 A
- self.result += "\n\\end{entry}\n\n"9 q# C( B/ {- d- t7 H: h2 R
- % k+ k ]2 R$ E. b" s/ _
- # O9 i' `% e a5 F! U
- def process_idiom_block(self, idiom_block):
& Z$ R) x( H3 ~+ R - result = ""
; N" M3 k" N/ P) B! t - idiom_body = idiom_block.find("span", {"class": "idiom-body"})
' t) }$ V Q( Y0 u. g - if idiom_body:
2 e# Z* z; n' O( N - for sense_tag in idiom_body.find_all("div", {"class": "dsense"}):& ?% k2 w) u% t0 t& I/ h$ P8 |- O
- result += self.process_sense_tag(sense_tag)
* S& }7 y" Z2 B, T - return result
( Y! s+ q- b+ ]% g - ) K; J3 Z; ]% O; t% d
- + Q4 N# G4 m9 c
- 5 J: f1 W, w* G5 C4 e
- def get_smart_vocabulary(self, smart_vocabulary_tag):8 V% F; b! q" T- [& ~0 N% B3 o
- result = ""
6 `. r: a, v/ C* Z5 x - for li_tag in smart_vocabulary_tag.find_all("li"):
E* Q; p9 ?3 a4 U9 V - result += "\\smart{" + latexify_html(li_tag) + "}\n"+ s- `5 Y# Y/ | {. Q# P7 E
- return "\n\\begin{smartvocabulary}\n" + result + "\\end{smartvocabulary}\n"
2 R9 [* K9 x: t1 p$ m - , k1 W v9 Q( J W" N
- , [! G1 _! b$ r2 j& J
- def process_part_of_speech(self, part_of_speech_tag):3 ]% t" ]0 l; E& T w& u+ g( ?
- """1 m) H' A6 L. Q( Q/ N1 Z3 S
- <div class="entry-body__el"> Y( M/ q( Y' P/ D8 q7 ]
- <div class="pos-header"></div>: Q- C! L$ R2 b1 {# y' q1 P
- <div class="pos-body"></div>, y m8 X! R. X
- <div class="pr relativDiv"></div>
; U4 ~% L1 I: q% q' Y - <div>) o# y+ l9 X. e9 K8 C0 n/ u+ Y
- """
- u6 j( D5 [2 M# _ - result = ""8 m/ m( k0 M) U, ]% Y/ z
- header_tag = part_of_speech_tag.find("div", {"class": "pos-header"})8 g' f9 b3 X- Z- T
- if header_tag:
3 z, ~( V9 r3 s - result += self.process_part_of_speech_header(header_tag)
* T+ h5 Q' S4 K) E - body_tag = part_of_speech_tag.find("div", {"class": "pos-body"}), B% \3 a6 Q( h0 X( _. M
- if body_tag:9 s3 c! Q8 `# `- H! P* u# q- T
- result += self.process_part_of_speech_body(body_tag)
) e! V4 u1 F* D; G X; t$ }, M) o - pv_block_tag = part_of_speech_tag.find("div", {"class": "pv-block"})5 ?8 U5 W; a' O6 e4 v9 X: u
- if pv_block_tag:
2 G$ J! Q0 |/ Z' }2 J% q U" C - result += self.process_pv_block(pv_block_tag)6 s0 e P1 K1 J; E) [3 [
- return result.strip()
* F; }* K- M, [ - 5 o ~- F$ c# q/ y& N
- % { |3 q# J3 D
    def process_pv_block(self, tag):
        """
        <div class="pv-block">
            <div class="di-title"></div>
            <span class="di-info"></span>
            <span class="pv-body dpv-body">
                <div class="pr dsense dsense-noh">...</div>
            </span>
        </div>
        """
        result = ""
        for item in tag.find_all("div", {"class": "sense-body"}):
            result += self.process_sense_body(item)
        return result

    def process_part_of_speech_header(self, header_tag):
        result = ""
        # title_tag = header_tag.find("div", {"class": "di-title"})
        # if title_tag:
        #     result += process_header_title(title_tag)
        posgram_tag = header_tag.find("div", {"class": "posgram"})
        if posgram_tag:
            result += self.process_part_of_speech_grammar(posgram_tag)
        for pronunciation_tag in header_tag.find_all("span", {"class": "dpron-i"}):
            result += self.process_pronunciation(pronunciation_tag)
        return result.strip()

    def process_header_title(self, title_tag):
        ## <span class="hw dhw">record</span>
        result = ""
        headword_tag = title_tag.find("span", {"class": "hw"})
        if headword_tag:
            result += "\\entry{" + latexify_html(headword_tag) + "}\n"
        else:
            result += "\\entry{" + latexify_html(title_tag) + "}\n"
        return result

    def process_part_of_speech_grammar(self, posgram_tag):
        result = ""
        part_of_speech_tag = posgram_tag.find("span", {"class": "pos"})
        if part_of_speech_tag:
            result += "\n\\pos{" + latexify_html(part_of_speech_tag) + "}"
        gram_tag = posgram_tag.find("span", {"class": "gc"})
        if gram_tag:
            result += "\n\\posgram{" + latexify_html(gram_tag) + "}"
        return result

    def process_pronunciation(self, pronunciation_tag):
        is_us_pronunciation = False
        if "us" in pronunciation_tag.attrs["class"]:
            is_us_pronunciation = True
        result = ""
        audio_tag = pronunciation_tag.find("source", {"type": "audio/mpeg"})

        ipa_tag = pronunciation_tag.find("span", {"class": "ipa"})
        if ipa_tag:
            if is_us_pronunciation:
                result += "\n\\ipaus{" + latexify_html(ipa_tag) + "}"
            else:
                result += "\n\\ipauk{" + latexify_html(ipa_tag) + "}"
        if audio_tag:
            audio_url = urljoin("https://dictionary.cambridge.org/", audio_tag.attrs["src"])
            result += "\n\\pronuniation{" + audio_url + "}"
        return result

    def process_sense_head(self, tag):
        text = latexify_html(tag)
        if "(" in text:
            left_bracket_index = text.index("(")
        else:
            left_bracket_index = -1
        if ")" in text:
            right_bracket_index = text.index(")")
        else:
            right_bracket_index = len(text)
        # Keep the guide word inside the parentheses, or the whole text if there are none.
        return "\n\\shortmeaning{" + text[left_bracket_index + 1: right_bracket_index].strip() + "}"

    def get_definition(self, tag):
        result = ""
        for def_info_tag in tag.select("span.def-info"):
            result += latexify_html(def_info_tag)
        for def_tag in tag.select("div.def"):
            result += latexify_html(def_tag)
        return result

    def process_def_block(self, tag):
        result = ""
        def_tag = tag.find("div", {"class": "ddef_h"})
        if def_tag:
            result += "\n\\meaningen{" + self.get_definition(def_tag) + "}"
        try:
            def_trans_tag = tag.select("span.trans")[0]
            if def_trans_tag:
                result += "\n\\meaningcn{" + latexify_html(def_trans_tag) + "}"
        except IndexError:
            # No Chinese translation in this definition block.
            pass
        for example_tag in tag.select("span.eg"):
            result += "\n\\example{" + latexify_html(example_tag) + "}"
        return result

    def process_phrase_block(self, phrase_block_tag):
        """
        <div class="phrase-head dphrase_h">...</div>
        <div class="phrase-body dphrase_b">...</div>
        <div class="bb hax">...</div>
        """
        result = "\\begin{phrase}{"
        result += phrase_block_tag.select("span.phrase-title")[0].get_text().strip() + "}"
        return result + "\\end{phrase}\n"

    def process_sense_body(self, tag):
        """
        <div class="pr phrase-block dphrase-block">...</div>
        <div class="def-block ddef_block">...</div>
        <div class="bb hax">...</div>
        """
        result = ""
        for def_block in tag.select("div.def-block"):
            result += self.process_def_block(def_block)
        for phrase_block in tag.select("div.phrase-block"):
            result += self.process_phrase_block(phrase_block)
        return result

    def process_sense_tag(self, sense_tag):
        """
        <h3 class="dsense_h">...</h3>
        <div class="sense-body dsense_b">...</div>
        <div class="smartt daccord">...</div>  # Smart Vocabulary
        <div class="bb hax">...</div>
        """
        result = ""
        sense_head_tag = sense_tag.find("h3", {"class": "dsense_h"})
        if sense_head_tag:
            result += self.process_sense_head(sense_head_tag)
        for sense_body_tag in sense_tag.select("div.sense-body"):
            result += self.process_sense_body(sense_body_tag)
        for smart_vocabulary_tag in sense_tag.find_all("div", {"class": "smartt"}):
            result += self.get_smart_vocabulary(smart_vocabulary_tag)

        return result

    def process_part_of_speech_body(self, body_tag):
        """
        <div class="pr dsense">...</div>
        <div class="pr dsense">...</div>
        """
        result = ""
        for sense_tag in body_tag.select("div.dsense"):
            result += self.process_sense_tag(sense_tag)
        return result


if __name__ == "__main__":
    string = ""
    CambridgeDictionaryScraper().start(cambridge_english_chinese_url)
    for row in conn.execute("SELECT entry, url, html FROM cambridge;"):
        entry = row[0]
        print(entry)
        url = row[1]
        html = row[2]
        record = CambridgeDictionaryExtractor(entry, entry_html=html)
        record.extract()
        string += record.result + "\n"
        # print(record.result)

    with open("./final.tex", "w", encoding="utf-8") as f:
        # Insert a \section heading before the first entry of each initial letter.
        try:
            for char in ascii_lowercase:
                string = string.replace("\\begin{entry}{" + char, "\\section{" + char + "}\n\n\\begin{entry}{" + char, 1)
        except:
            pass

        f.write(string)
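
One practical note: the script imports time but never pauses between requests, and there is no retry if dictionary.cambridge.org drops a connection mid-crawl. Below is a minimal sketch of a politer wrapper around requests.get; polite_get, the delay values, and the timeout are hypothetical additions rather than part of the original script, and you would have to swap the plain requests.get(url, headers=header) calls in get_word_page(), start() and find_child_entry() for it yourself.

import time
import requests

header = {"User-Agent": "Chrome/87.0.4280.88 Safari/537.36"}  # same header as the script above


def polite_get(url, retries=3, delay=1.0):
    """Hypothetical helper: GET with a short pause and simple retries."""
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=header, timeout=30)
            r.raise_for_status()
            time.sleep(delay)  # brief pause so the crawl does not hammer the server
            return r
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay * (attempt + 1))  # back off a little before retrying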
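Because every entry's raw HTML is cached in the SQLite database, final.tex can also be rebuilt without touching the website again. Here is a sketch of the __main__ block with the scraping step skipped; it assumes the cambridge table was already populated by an earlier run, and everything it uses (conn, ascii_lowercase, CambridgeDictionaryExtractor) comes from the script above.

if __name__ == "__main__":
    # Rebuild final.tex from cached HTML only; no network access needed.
    string = ""
    for entry, url, html in conn.execute("SELECT entry, url, html FROM cambridge;"):
        record = CambridgeDictionaryExtractor(entry, entry_html=html)
        record.extract()
        string += record.result + "\n"

    # Insert a \section heading before the first entry of each initial letter.
    for char in ascii_lowercase:
        string = string.replace("\\begin{entry}{" + char,
                                "\\section{" + char + "}\n\n\\begin{entry}{" + char, 1)

    with open("./final.tex", "w", encoding="utf-8") as f:
        f.write(string)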