TA的每日心情 | 开心 2021-1-4 22:53 |
---|
签到天数: 5 天 [LV.2]偶尔看看I
|
发表于 2021-1-4 02:18:57
|
显示全部楼层
- #!/usr/bin/env python3- T* {" w4 W9 Q9 R6 k+ F
- # -*- coding: utf-8 -*-
1 I( X9 ~0 o: H0 m1 q
) I: y5 J- ~/ e+ K, e( r. m& e- import os: a: H9 M9 ?# d, b& w
- import requests
4 W5 D; l1 C; g - import re! x3 G q* W8 ]6 o5 q
- import time
" P7 [% I" T0 I0 w8 y. ]5 F1 h - import sqlite3
5 A9 I# ~( M' V1 \
- C* ~4 a/ {+ O- from string import ascii_lowercase
! R# U$ e* m% u) g
4 H: Q; p' |1 @/ t* a# [9 Q- from pathlib import Path
& T' t5 }; S \2 X" S% r! y- I - from urllib.parse import urljoin+ w. m# c4 d1 w/ y; ]5 e: ~' E
- from html.parser import HTMLParser9 c9 W0 ?+ d1 o8 W' e I
- from bs4 import BeautifulSoup
) C$ Z1 P7 ^' P# r/ ? - 6 }, N/ i _) t% x3 p! U& I
# Browse-page roots for several online dictionaries (only the Cambridge ones
# are actually used by the scraper below; the others are kept for reference).
webster_url = "https://www.merriam-webster.com/browse/dictionary/"
oxford_url = "https://www.oxfordlearnersdictionaries.com/browse/english/"
macmillan_url = "https://www.macmillandictionary.com/browse/british/"
cambridge_url = "https://dictionary.cambridge.org/browse/english/"

# English -> Simplified-Chinese browse index: the crawl entry point.
cambridge_english_chinese_url = "https://dictionary.cambridge.org/browse/english-chinese-simplified/"

# Direct-search endpoint and request defaults.
# NOTE(review): base_url and payload are not referenced in this file — they
# may be used by code outside this view, so they are kept.
base_url = "https://dictionary.cambridge.org/search/direct/"
header = {"User-Agent": "Chrome/87.0.4280.88 Safari/537.36"}
payload = {'datasetsearch': 'english-chinese-simplified', 'q': ""}
# Module-level cache database: one row per dictionary entry.  The connection
# is shared by the scraper (writes) and the extractor driver (reads).
conn = sqlite3.connect("Cambridge Advanced Learner's English-Chinese Dictionary.db")

# entry: headword text (primary key); url: source page; html: scraped entry
# body; latex: optional pre-rendered LaTeX (not populated by this script).
conn.execute("""
CREATE TABLE IF NOT EXISTS cambridge (
    entry TEXT PRIMARY KEY
               UNIQUE
               NOT NULL,
    url TEXT NOT NULL,
    html TEXT NOT NULL,
    latex TEXT
);
""")

conn.commit()
1 K2 G8 `9 \: O o8 c- " e+ i' u0 O/ \9 h" K1 O$ @
def latexify(string):
    """Normalize whitespace in *string* and escape LaTeX special characters.

    Runs of whitespace are collapsed to a single space, whitespace preceding
    punctuation/closing characters is removed, and the ten LaTeX special
    characters are replaced with their escaped forms.

    Returns the escaped string.
    """
    # Escaped form for each LaTeX special character.
    specials = {
        "%": r"\%",
        "\\": r"\textbackslash{}",
        "$": r"\$",
        "#": r"\#",
        "&": r"\&",
        "{": r"\{",
        "}": r"\}",
        "^": r"\^",
        "_": r"\_",
        # Fixed: the original wrote "r\textasciitilde{}" with the raw-string
        # prefix *inside* the quotes, emitting a literal 'r' + TAB instead.
        "~": r"\textasciitilde{}",
    }
    trimmed_string = re.sub(r"\s{2,}", " ", string)
    # Drop whitespace before punctuation that should hug the preceding word
    # (anything that is not a word char, quote, or opening bracket).
    compact = re.sub(r"\s+([^\w'\"({[])", r"\1", trimmed_string)
    return "".join(specials.get(char, char) for char in compact)
: ^ c7 G: z, H# }# t, @ - f- i, o e' V" S
def latexify_html(beautifulsoup_object):
    """Return the LaTeX-escaped, whitespace-normalized text of a soup node.

    Accepts either a BeautifulSoup tag (uses .get_text()) or a plain string,
    which is escaped directly.
    """
    try:
        # Only this attribute access can legitimately fail; keep the try small.
        text = beautifulsoup_object.get_text()
    except AttributeError:
        # Fixed: was a bare `except:`.  A plain string (no .get_text())
        # is the only fallback case the original intended to handle.
        return latexify(beautifulsoup_object)
    return latexify(re.sub(r"\s{2,}", " ", text.replace("\n", " ")).strip())
* B/ B0 I& N6 @& ]: c) n2 n
" w9 u" X' f& D3 j0 z2 b# L$ m C- 4 z3 r. s7 s/ U h w4 j1 f
class CambridgeDictionaryScraper:
    """Scraper for Cambridge Dictionary browse pages.

    Walks the browse index recursively, fetches each entry page, and stores
    its HTML in the module-level sqlite connection ``conn``.  Re-runs are
    resumable: URLs already in the database are skipped.
    """

    url_set = set()  # Shared by all instances: URLs already stored in the DB.

    def __init__(self):
        # Seed the de-duplication set from rows saved by earlier runs.
        for item in conn.execute("SELECT url FROM cambridge;"):
            self.url_set.add(item[0])
        print("Already Downloaded " + str(len(self.url_set)) + " Words!")

    def __del__(self):
        # Best-effort final flush; find_child_entry also commits per insert.
        conn.commit()

    def get_word_page(self, url):
        """Fetch *url* and return the HTML of the dictionary entry body.

        Tries progressively broader containers ("entry", "di-body", then
        "entryContent" for beta words) and falls back to the whole page;
        <script> tags are stripped from the result.
        """
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        entry_tag = bs_obj.find("div", {"class": "entry"})
        if not entry_tag:
            entry_tag = bs_obj.find("div", {"class": "di-body"})
        if not entry_tag:
            ## Beta Words use a different container.
            entry_tag = bs_obj.find("div", {"id": "entryContent"})
        if not entry_tag:
            # Fixed: the original repeated this fallback twice; once suffices.
            entry_tag = bs_obj

        for tag in entry_tag.find_all("script"):
            tag.extract()

        result_string = str(entry_tag)
        return result_string

    def start(self, url):
        """Crawl the top-level browse page and visit each letter section."""
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        for li_tag in bs_obj.select("li.lpr-10"):
            child_url = urljoin(cambridge_url, li_tag.a.attrs["href"])
            print(child_url)
            self.find_child_entry(child_url)

    def find_child_entry(self, url):
        """Recursively walk an index page, storing every leaf entry found."""
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        for li_tag in bs_obj.select("li.t-i"):
            child_url = urljoin(url, li_tag.a.attrs["href"]).strip()
            child_text = li_tag.get_text().strip()
            if "..." in child_text:
                # "word..." links point at a deeper index page, not an entry.
                self.find_child_entry(child_url)
            else:
                if child_url in self.url_set:
                    continue
                print(child_text + "\t" + child_url)
                conn.execute(
                    "INSERT INTO cambridge (entry, url, html) VALUES (?, ?, ?);",
                    (child_text, child_url, self.get_word_page(child_url)))
                # Commit per entry so an interrupted crawl loses nothing.
                conn.commit()
                self.url_set.add(child_url)
7 T6 |& A' j- R/ a- ' N- L; \# I3 a
class CambridgeDictionaryExtractor():
    """Convert one stored Cambridge entry's HTML into LaTeX markup.

    Construct with the headword and its HTML, call extract(), then read the
    generated LaTeX from ``self.result``.
    """

    def __init__(self, entry, entry_html=""):
        self.entry = latexify(entry)   # LaTeX-escaped headword
        self.entry_html = entry_html   # raw entry HTML from the database
        self.result = ""               # accumulated LaTeX output

    def extract(self):
        """Build the LaTeX for the whole entry into ``self.result``.

        <div class="pr idiom-block">
            <div class="idiom-block"></div>
        </div>
        """
        bs_obj = BeautifulSoup(self.entry_html, "html.parser")
        self.result += "\\begin{entry}{" + self.entry + "}"
        for part_of_speech_tag in bs_obj.find_all("div", {"class": "entry-body__el"}):
            self.result += ("\n\\begin{Partofspeech}\n"
                            + self.process_part_of_speech(part_of_speech_tag)
                            + "\n\\end{Partofspeech}")
        idiom_wrapper = bs_obj.find("div", {"class": "idiom-block"})
        if idiom_wrapper:
            # Actual idiom blocks are nested inside the outer wrapper div.
            for idiom_tag in idiom_wrapper.find_all("div", {"class": "idiom-block"}):
                self.result += ("\n\\begin{idiom}"
                                + self.process_idiom_block(idiom_tag)
                                + "\n\\end{idiom}")
        self.result += "\n\\end{entry}\n\n"

    def process_idiom_block(self, idiom_block):
        """Return LaTeX for the senses inside one idiom block."""
        result = ""
        idiom_body = idiom_block.find("span", {"class": "idiom-body"})
        if idiom_body:
            for sense_tag in idiom_body.find_all("div", {"class": "dsense"}):
                result += self.process_sense_tag(sense_tag)
        return result

    def get_smart_vocabulary(self, smart_vocabulary_tag):
        """Wrap each related-word <li> in \\smart{...} inside a smartvocabulary env."""
        result = ""
        for li_tag in smart_vocabulary_tag.find_all("li"):
            result += "\\smart{" + latexify_html(li_tag) + "}\n"
        return "\n\\begin{smartvocabulary}\n" + result + "\\end{smartvocabulary}\n"

    def process_part_of_speech(self, part_of_speech_tag):
        """Return LaTeX for one part-of-speech section.

        <div class="entry-body__el">
            <div class="pos-header"></div>
            <div class="pos-body"></div>
            <div class="pr relativDiv"></div>
        <div>
        """
        result = ""
        header_tag = part_of_speech_tag.find("div", {"class": "pos-header"})
        if header_tag:
            result += self.process_part_of_speech_header(header_tag)
        body_tag = part_of_speech_tag.find("div", {"class": "pos-body"})
        if body_tag:
            result += self.process_part_of_speech_body(body_tag)
        pv_block_tag = part_of_speech_tag.find("div", {"class": "pv-block"})
        if pv_block_tag:
            result += self.process_pv_block(pv_block_tag)
        return result.strip()

    def process_pv_block(self, tag):
        """Return LaTeX for a phrasal-verb block.

        <div class="pv-block">
            <div class="di-title"></div>
            <span clss="di-info"></span>
            <span class="pv-body dpv-body">
                <div class="pr dsense dsense-noh">
            <span>
        <div>
        """
        result = ""
        # Fixed: the original passed the *set* {"class", "sense-body"} to
        # find_all, which matches on attribute names, not the class value.
        for item in tag.find_all("div", {"class": "sense-body"}):
            result += self.process_sense_body(item)
        return result

    def process_part_of_speech_header(self, header_tag):
        """Return LaTeX for the header: grammar info and pronunciations."""
        result = ""
        # title_tag = header_tag.find("div", {"class": "di-title"})
        # if title_tag:
        #     result += process_header_title(title_tag)
        posgram_tag = header_tag.find("div", {"class": "posgram"})
        if posgram_tag:
            result += self.process_part_of_speech_grammar(posgram_tag)
        for pronunciation_tag in header_tag.find_all("span", {"class": "dpron-i"}):
            result += self.process_pronunciation(pronunciation_tag)

        return result.strip()

    def process_header_title(self, title_tag):
        """Return \\entry{...} for the headword span, or the whole title."""
        ## <span class="hw dhw">record</span>
        result = ""
        headword_tag = title_tag.find("span", {"class": "hw"})
        if headword_tag:
            result += "\\entry{" + latexify_html(headword_tag) + "}\n"
        else:
            result += "\\entry{" + latexify_html(title_tag) + "}\n"
        return result

    def process_part_of_speech_grammar(self, posgram_tag):
        """Emit \\pos{...} and \\posgram{...} from the posgram header div."""
        result = ""
        part_of_speech_tag = posgram_tag.find("span", {"class": "pos"})
        if part_of_speech_tag:
            result += "\n\\pos{" + latexify_html(part_of_speech_tag) + "}"
        gram_tag = posgram_tag.find("span", {"class": "gc"})
        if gram_tag:
            result += "\n\\posgram{" + latexify_html(gram_tag) + "}"
        return result

    def process_pronunciation(self, pronunciation_tag):
        """Emit \\ipaus/\\ipauk IPA markup plus the audio URL, if present."""
        is_us_pronunciation = False
        if "us" in pronunciation_tag.attrs["class"]:
            is_us_pronunciation = True
        result = ""
        audio_tag = pronunciation_tag.find("source", {"type": "audio/mpeg"})

        ipa_tag = pronunciation_tag.find("span", {"class": "ipa"})
        if ipa_tag:
            if is_us_pronunciation:
                result += "\n\\ipaus{" + latexify_html(ipa_tag) + "}"
            else:
                result += "\n\\ipauk{" + latexify_html(ipa_tag) + "}"
        if audio_tag:
            audio_url = urljoin("https://dictionary.cambridge.org/", audio_tag.attrs["src"])
            # NOTE(review): "\pronuniation" looks like a typo for
            # "\pronunciation", but the macro name must match the user's
            # LaTeX preamble, so it is preserved byte-for-byte.
            result += "\n\pronuniation{" + audio_url + "}"
        return result

    def process_sense_head(self, tag):
        """Return \\shortmeaning{...} from the guide word in parentheses."""
        text = latexify_html(tag)
        if "(" in text:
            left_bracket_index = text.index("(")
        else:
            left_bracket_index = 0
        if ")" in text:
            right_bracket_index = text.index(")")
        else:
            right_bracket_index = len(text)
        return "\n\\shortmeaning{" + text[left_bracket_index + 1: right_bracket_index].strip() + "}"

    def get_definition(self, tag):
        """Concatenate the definition info and definition text of one ddef_h."""
        result = ""
        # Fixed: "span.def-into" never matches; Cambridge markup uses the
        # "def-info" class (e.g. <span class="def-info ddef-info">).
        for def_info_tag in tag.select("span.def-info"):
            result += latexify_html(def_info_tag)
        for def_tag in tag.select("div.def"):
            result += latexify_html(def_tag)
        return result

    def process_def_block(self, tag):
        """Return English meaning, Chinese translation, and example sentences."""
        result = ""
        def_tag = tag.find("div", {"class": "ddef_h"})
        if def_tag:
            result += "\n\\meaningen{" + self.get_definition(def_tag) + "}"
        try:
            def_trans_tag = tag.select("span.trans")[0]
            if def_trans_tag:
                result += "\n\\meaningcn{" + latexify_html(def_trans_tag) + "}"
        except IndexError:
            # Fixed: was a bare `except:`.  Only the [0] on an empty select
            # result (no Chinese translation) is expected to fail here.
            pass
        for example_tag in tag.select("span.eg"):
            result += "\n\\example{" + latexify_html(example_tag) + "}"
        return result

    def process_phrase_block(self, phrase_block_tag):
        """Return a phrase environment for one phrase block.

        <div class="phrase-head dphrase_h">...</div>
        <div class="phrase-body dphrase_b">...</div>
        <div class="bb hax">...</div>
        """
        result = "\\begin{phrase}{"
        result += phrase_block_tag.select("span.phrase-title")[0].get_text().strip() + "}"
        # Fixed: the original closed with mismatched "\end{pharse}", which
        # produces invalid LaTeX.
        return result + "\\end{phrase}\n"

    def process_sense_body(self, tag):
        """Return LaTeX for the def-blocks and phrase-blocks of one sense body.

        <div class="pr phrase-block dphrase-block">...</div>
        <div class="def-block ddef_block">...</div>
        <div class="bb hax">...</div>
        """
        result = ""
        for def_block in tag.select("div.def-block"):
            result += self.process_def_block(def_block)
        # Fixed: selector typo "div.pharse-block" never matched, silently
        # dropping all phrase blocks (cf. the class name in the docstring).
        for phrase_block in tag.select("div.phrase-block"):
            result += self.process_phrase_block(phrase_block)
        return result

    def process_sense_tag(self, sense_tag):
        """Return LaTeX for one sense: head, bodies, smart vocabulary.

        <h3 class="dsense_h">...</h3>
        <div class="sense-body dsense_b">...</div>
        <div class="smartt daccord">...</div> # Smart Vocabulary
        <div class="bb hax">...</div>
        """
        result = ""
        sense_head_tag = sense_tag.find("h3", {"class": "dsense_h"})
        if sense_head_tag:
            result += self.process_sense_head(sense_head_tag)
        for sense_body_tag in sense_tag.select("div.sense-body"):
            result += self.process_sense_body(sense_body_tag)
        for smart_vocabulary_tag in sense_tag.find_all("div", {"class": "smartt"}):
            result += self.get_smart_vocabulary(smart_vocabulary_tag)

        return result

    def process_part_of_speech_body(self, body_tag):
        """Return LaTeX for every sense in a pos-body.

        <div class="pr dsense">...</div>
        <div class="pr dsense">...</div>
        """
        result = ""
        for sense_tag in body_tag.select("div.dsense"):
            result += self.process_sense_tag(sense_tag)
        return result
7 w1 W/ L; u7 L* Z4 p
if __name__ == "__main__":
    # Crawl first (resumable: already-stored URLs are skipped), then convert
    # every cached entry to LaTeX and write one document body to final.tex.
    CambridgeDictionaryScraper().start(cambridge_english_chinese_url)

    # Fixed: the original accumulated with `string += ...` (quadratic);
    # collect pieces in a list and join once instead.
    pieces = []
    for row in conn.execute("SELECT entry, url, html FROM cambridge;"):
        entry = row[0]
        print(entry)
        url = row[1]
        html = row[2]
        record = CambridgeDictionaryExtractor(entry, entry_html=html)
        record.extract()
        pieces.append(record.result + "\n")
    string = "".join(pieces)

    # Insert a \section heading before the first entry of each letter.
    # Fixed: the original wrapped this loop in try/bare-except, but
    # str.replace cannot raise here; the guard only hid real bugs.
    for char in ascii_lowercase:
        string = string.replace(
            "\\begin{entry}{" + char,
            "\\section{" + char + "}\n\n\\begin{entry}{" + char, 1)

    with open("./final.tex", "w", encoding="utf-8") as f:
        # NOTE(review): the original called string.replace("", "") — a no-op,
        # likely a garbled attempt to strip a stray character; dropped.
        f.write(string)
复制代码 |
|