TA的每日心情 | 开心 2021-1-4 22:53 |
---|
签到天数: 5 天 [LV.2]偶尔看看I
|
发表于 2021-1-4 02:18:57
|
显示全部楼层
- #!/usr/bin/env python3
* `- s& f( ]( ]* W3 c! J' A - # -*- coding: utf-8 -*-+ f$ T& Z0 v9 i; v) q: L7 P3 R
7 K- }, q; X) O2 N& [2 V. J- import os
2 W) L: ], y6 @/ E - import requests
3 f- {$ t1 Z, j+ j - import re+ J$ q- |- q) A7 ^) N9 w
- import time( b5 z! z% V" ^7 Y
- import sqlite3
9 W% s% Q9 S6 s* `6 M; R) r6 h4 F
$ c2 R! k/ T3 @$ f- from string import ascii_lowercase
$ b) Y1 V( L# h; i$ u0 B5 J4 h - ( d+ N- A: J8 ]$ [2 n4 ~" d) i
- from pathlib import Path
6 ?' @; X5 S7 d* x- ? - from urllib.parse import urljoin1 \9 W. B% g' g" A2 O. `
- from html.parser import HTMLParser
! N5 \: U$ V+ @+ O$ f9 a' { - from bs4 import BeautifulSoup, W& |# w! ?, b1 {# J3 p! d
# Browse-index roots for several learner's dictionaries.  Only the Cambridge
# URLs are used below; the others are kept for reference.
webster_url = "https://www.merriam-webster.com/browse/dictionary/"
oxford_url = "https://www.oxfordlearnersdictionaries.com/browse/english/"
macmillan_url = "https://www.macmillandictionary.com/browse/british/"
cambridge_url = "https://dictionary.cambridge.org/browse/english/"

cambridge_english_chinese_url = "https://dictionary.cambridge.org/browse/english-chinese-simplified/"

# Direct-search endpoint plus the request headers/params used against it.
base_url = "https://dictionary.cambridge.org/search/direct/"
header = {"User-Agent": "Chrome/87.0.4280.88 Safari/537.36"}
payload = {'datasetsearch': 'english-chinese-simplified', 'q': ""}
- 2 B; z" l: e* e$ n
# Local cache: one row per dictionary entry.  ``latex`` is filled in later by
# the extractor, so it is the only nullable column.
conn = sqlite3.connect("Cambridge Advanced Learner's English-Chinese Dictionary.db")

conn.execute("""
CREATE TABLE IF NOT EXISTS cambridge (
    entry TEXT PRIMARY KEY
               UNIQUE
               NOT NULL,
    url TEXT NOT NULL,
    html TEXT NOT NULL,
    latex TEXT
);
""")

conn.commit()
- ) P( c I# d6 G$ K$ B
def latexify(string):
    """Collapse whitespace in *string* and escape LaTeX special characters.

    Runs of whitespace become a single space, whitespace preceding closing
    punctuation is dropped, and every character with a special meaning in
    LaTeX is replaced by its escaped form.
    """
    # LaTeX-special characters mapped to their escaped forms.
    escapes = {
        "%": r"\%",
        "\\": r"\textbackslash{}",
        "$": r"\$",
        "#": r"\#",
        "&": r"\&",
        "{": r"\{",
        "}": r"\}",
        "^": r"\^",
        "_": r"\_",
        # BUG FIX: the original wrote "r\textasciitilde{}" -- the raw-string
        # prefix ended up inside the quotes, emitting a literal 'r' + tab.
        "~": r"\textasciitilde{}",
    }
    trimmed_string = re.sub(r"\s{2,}", " ", string)
    # Drop whitespace before anything that is not a word character, quote,
    # or opening bracket (i.e. before closing punctuation).
    normalized = re.sub(r"""\s+([^\w'"({[])""", r"\1", trimmed_string)
    return "".join(escapes.get(char, char) for char in normalized)
def latexify_html(beautifulsoup_object):
    """Flatten a BeautifulSoup tag to one line of LaTeX-escaped text.

    Falls back to escaping the argument as a plain string when it has no
    ``get_text`` method (callers sometimes pass raw strings).
    """
    try:
        text = beautifulsoup_object.get_text()
    except AttributeError:
        # BUG FIX: was a bare ``except:`` -- narrow to the only expected
        # failure (argument is not a tag) so real errors are not swallowed.
        return latexify(beautifulsoup_object)
    return latexify(re.sub(r"\s{2,}", " ", text.replace("\n", " ")).strip())
. W) h i! X' ~7 D+ H- l# H! b
2 z4 x/ f* Z1 V {
class CambridgeDictionaryScraper:
    """Scraper for the Cambridge Dictionary browse index.

    Walks the alphabetical browse pages, follows every entry link and
    stores each entry's HTML in the ``cambridge`` SQLite table.
    """

    url_set = set()  # Shared by all instances: URLs already downloaded.

    def __init__(self):
        # Resume support: seed the seen-set from rows already in the db.
        for item in conn.execute("SELECT url FROM cambridge;"):
            self.url_set.add(item[0])
        print("Already Downloaded " + str(len(self.url_set)) + " Words!")

    def __del__(self):
        conn.commit()

    def get_word_page(self, url):
        """Fetch *url* and return the entry HTML fragment as a string.

        Tries progressively broader containers and falls back to the whole
        page when no known entry wrapper is present.
        """
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        entry_tag = bs_obj.find("div", {"class": "entry"})
        if not entry_tag:
            entry_tag = bs_obj.find("div", {"class": "di-body"})
        if not entry_tag:
            # Beta words use a different wrapper.
            entry_tag = bs_obj.find("div", {"id": "entryContent"})
        if not entry_tag:
            # NOTE: the original repeated this fallback twice; once suffices.
            entry_tag = bs_obj
        # Strip <script> tags so only static markup is stored.
        for tag in entry_tag.find_all("script"):
            tag.extract()
        return str(entry_tag)

    def start(self, url):
        """Entry point: iterate the per-letter index pages under *url*."""
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        for li_tag in bs_obj.select("li.lpr-10"):
            child_url = urljoin(cambridge_url, li_tag.a.attrs["href"])
            print(child_url)
            self.find_child_entry(child_url)

    def find_child_entry(self, url):
        """Recurse through intermediate "..." range pages; store leaf entries."""
        r = requests.get(url, headers=header)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        for li_tag in bs_obj.select("li.t-i"):
            child_url = urljoin(url, li_tag.a.attrs["href"]).strip()
            child_text = li_tag.get_text().strip()
            if "..." in child_text:
                # Intermediate range page (e.g. "aa...ab"): descend.
                self.find_child_entry(child_url)
            else:
                if child_url in self.url_set:
                    continue
                print(child_text + "\t" + child_url)
                conn.execute("INSERT INTO cambridge (entry, url, html) VALUES (?, ?, ?);",
                             (child_text, child_url, self.get_word_page(child_url)))
                conn.commit()
                self.url_set.add(child_url)
- ; t' M% Z2 s1 k% ^# D( ~# r* H6 q
class CambridgeDictionaryExtractor():
    """Convert one stored Cambridge entry's HTML into LaTeX markup.

    The generated LaTeX accumulates in ``self.result``; callers read it
    after invoking :meth:`extract`.
    """

    def __init__(self, entry, entry_html=""):
        self.entry = latexify(entry)    # LaTeX-escaped headword
        self.entry_html = entry_html    # raw HTML fetched by the scraper
        self.result = ""                # output LaTeX, filled by extract()

    def extract(self):
        """Build the ``entry`` environment from ``self.entry_html``.

        Expected layout::

            <div class="pr idiom-block">
                <div class="idiom-block"></div>
            </div>
        """
        bs_obj = BeautifulSoup(self.entry_html, "html.parser")
        self.result += "\\begin{entry}{" + self.entry + "}"
        for part_of_speech_tag in bs_obj.find_all("div", {"class": "entry-body__el"}):
            self.result += "\n\\begin{Partofspeech}\n" + self.process_part_of_speech(part_of_speech_tag) + "\n\\end{Partofspeech}"
        idiom_block = bs_obj.find("div", {"class": "idiom-block"})
        if idiom_block:
            for idiom_block in idiom_block.find_all("div", {"class": "idiom-block"}):
                self.result += "\n\\begin{idiom}" + self.process_idiom_block(idiom_block) + "\n\\end{idiom}"
        self.result += "\n\\end{entry}\n\n"

    def process_idiom_block(self, idiom_block):
        """Render every sense inside an idiom body."""
        result = ""
        idiom_body = idiom_block.find("span", {"class": "idiom-body"})
        if idiom_body:
            for sense_tag in idiom_body.find_all("div", {"class": "dsense"}):
                result += self.process_sense_tag(sense_tag)
        return result

    def get_smart_vocabulary(self, smart_vocabulary_tag):
        """Render a "SMART Vocabulary" related-words box."""
        result = ""
        for li_tag in smart_vocabulary_tag.find_all("li"):
            result += "\\smart{" + latexify_html(li_tag) + "}\n"
        return "\n\\begin{smartvocabulary}\n" + result + "\\end{smartvocabulary}\n"

    def process_part_of_speech(self, part_of_speech_tag):
        """Render one part-of-speech section.

        Expected layout::

            <div class="entry-body__el">
                <div class="pos-header"></div>
                <div class="pos-body"></div>
                <div class="pr relativDiv"></div>
            </div>
        """
        result = ""
        header_tag = part_of_speech_tag.find("div", {"class": "pos-header"})
        if header_tag:
            result += self.process_part_of_speech_header(header_tag)
        body_tag = part_of_speech_tag.find("div", {"class": "pos-body"})
        if body_tag:
            result += self.process_part_of_speech_body(body_tag)
        pv_block_tag = part_of_speech_tag.find("div", {"class": "pv-block"})
        if pv_block_tag:
            result += self.process_pv_block(pv_block_tag)
        return result.strip()

    def process_pv_block(self, tag):
        """Render a phrasal-verb block.

        Expected layout::

            <div class="pv-block">
                <div class="di-title"></div>
                <span class="di-info"></span>
                <span class="pv-body dpv-body">
                    <div class="pr dsense dsense-noh">
            </div>
        """
        result = ""
        # BUG FIX: the original passed the set {"class", "sense-body"} where
        # BeautifulSoup expects an attrs dict, so nothing was ever matched.
        for item in tag.find_all("div", {"class": "sense-body"}):
            result += self.process_sense_body(item)
        return result

    def process_part_of_speech_header(self, header_tag):
        """Render grammar info and pronunciations from a pos-header."""
        result = ""
        # title_tag = header_tag.find("div", {"class": "di-title"})
        # if title_tag:
        #     result += process_header_title(title_tag)
        posgram_tag = header_tag.find("div", {"class": "posgram"})
        if posgram_tag:
            result += self.process_part_of_speech_grammar(posgram_tag)
        for pronunciation_tag in header_tag.find_all("span", {"class": "dpron-i"}):
            result += self.process_pronunciation(pronunciation_tag)
        return result.strip()

    def process_header_title(self, title_tag):
        """Render the headword, e.g. <span class="hw dhw">record</span>."""
        result = ""
        headword_tag = title_tag.find("span", {"class": "hw"})
        if headword_tag:
            result += "\\entry{" + latexify_html(headword_tag) + "}\n"
        else:
            result += "\\entry{" + latexify_html(title_tag) + "}\n"
        return result

    def process_part_of_speech_grammar(self, posgram_tag):
        """Render part-of-speech and grammar-code labels."""
        result = ""
        part_of_speech_tag = posgram_tag.find("span", {"class": "pos"})
        if part_of_speech_tag:
            result += "\n\\pos{" + latexify_html(part_of_speech_tag) + "}"
        gram_tag = posgram_tag.find("span", {"class": "gc"})
        if gram_tag:
            result += "\n\\posgram{" + latexify_html(gram_tag) + "}"
        return result

    def process_pronunciation(self, pronunciation_tag):
        """Render IPA text and the audio URL for one pronunciation span."""
        is_us_pronunciation = False
        if "us" in pronunciation_tag.attrs["class"]:
            is_us_pronunciation = True
        result = ""
        audio_tag = pronunciation_tag.find("source", {"type": "audio/mpeg"})
        ipa_tag = pronunciation_tag.find("span", {"class": "ipa"})
        if ipa_tag:
            if is_us_pronunciation:
                result += "\n\\ipaus{" + latexify_html(ipa_tag) + "}"
            else:
                result += "\n\\ipauk{" + latexify_html(ipa_tag) + "}"
        if audio_tag:
            audio_url = urljoin("https://dictionary.cambridge.org/", audio_tag.attrs["src"])
            # NOTE(review): "\pronuniation" looks like a typo for
            # "\pronunciation", but it presumably matches a macro defined in
            # the author's LaTeX preamble -- left unchanged; confirm there.
            result += "\n\\pronuniation{" + audio_url + "}"
        return result

    def process_sense_head(self, tag):
        """Extract the short gloss between parentheses in a sense head."""
        text = latexify_html(tag)
        if "(" in text:
            left_bracket_index = text.index("(")
        else:
            left_bracket_index = 0
        if ")" in text:
            right_bracket_index = text.index(")")
        else:
            right_bracket_index = len(text)
        return "\n\\shortmeaning{" + text[left_bracket_index + 1: right_bracket_index].strip() + "}"

    def get_definition(self, tag):
        """Concatenate the definition info and body text of a ddef_h tag."""
        result = ""
        # BUG FIX: selector was "span.def-into" (typo); the Cambridge markup
        # uses class "def-info", so this text was silently dropped.
        for def_info_tag in tag.select("span.def-info"):
            result += latexify_html(def_info_tag)
        for def_tag in tag.select("div.def"):
            result += latexify_html(def_tag)
        return result

    def process_def_block(self, tag):
        """Render one definition block: meaning, translation, examples."""
        result = ""
        def_tag = tag.find("div", {"class": "ddef_h"})
        if def_tag:
            result += "\n\\meaningen{" + self.get_definition(def_tag) + "}"
        try:
            def_trans_tag = tag.select("span.trans")[0]
            if def_trans_tag:
                result += "\n\\meaningcn{" + latexify_html(def_trans_tag) + "}"
        except IndexError:
            # BUG FIX: was a bare ``except:``; only the missing-translation
            # case (empty select result) should be ignored.
            pass
        for example_tag in tag.select("span.eg"):
            result += "\n\\example{" + latexify_html(example_tag) + "}"
        return result

    def process_phrase_block(self, phrase_block_tag):
        """Render a phrase block title.

        Expected layout::

            <div class="phrase-head dphrase_h">...</div>
            <div class="phrase-body dphrase_b">...</div>
            <div class="bb hax">...</div>
        """
        result = "\\begin{phrase}{"
        result += phrase_block_tag.select("span.phrase-title")[0].get_text().strip() + "}"
        # BUG FIX: the closing tag was "\\end{pharse}" (typo), producing an
        # unbalanced LaTeX environment.
        return result + "\\end{phrase}\n"

    def process_sense_body(self, tag):
        """Render the definition and phrase blocks inside a sense body.

        Expected layout::

            <div class="pr phrase-block dphrase-block">...</div>
            <div class="def-block ddef_block">...</div>
            <div class="bb hax">...</div>
        """
        result = ""
        for def_block in tag.select("div.def-block"):
            result += self.process_def_block(def_block)
        # BUG FIX: selector was "div.pharse-block" (typo), so phrase blocks
        # were never rendered.
        for phrase_block in tag.select("div.phrase-block"):
            result += self.process_phrase_block(phrase_block)
        return result

    def process_sense_tag(self, sense_tag):
        """Render one sense: head, bodies, SMART Vocabulary box.

        Expected layout::

            <h3 class="dsense_h">...</h3>
            <div class="sense-body dsense_b">...</div>
            <div class="smartt daccord">...</div>  <!-- Smart Vocabulary -->
            <div class="bb hax">...</div>
        """
        result = ""
        sense_head_tag = sense_tag.find("h3", {"class": "dsense_h"})
        if sense_head_tag:
            result += self.process_sense_head(sense_head_tag)
        for sense_body_tag in sense_tag.select("div.sense-body"):
            result += self.process_sense_body(sense_body_tag)
        for smart_vocabulary_tag in sense_tag.find_all("div", {"class": "smartt"}):
            result += self.get_smart_vocabulary(smart_vocabulary_tag)
        return result

    def process_part_of_speech_body(self, body_tag):
        """Render every sense in a pos-body.

        Expected layout::

            <div class="pr dsense">...</div>
            <div class="pr dsense">...</div>
        """
        result = ""
        for sense_tag in body_tag.select("div.dsense"):
            result += self.process_sense_tag(sense_tag)
        return result
-
! w8 d0 `* P- [2 l4 X' Z+ H6 v
if __name__ == "__main__":
    string = ""
    CambridgeDictionaryScraper().start(cambridge_english_chinese_url)
    for row in conn.execute("SELECT entry, url, html FROM cambridge;"):
        entry = row[0]
        print(entry)
        url = row[1]
        html = row[2]
        record = CambridgeDictionaryExtractor(entry, entry_html=html)
        record.extract()
        string += record.result + "\n"
        # print(record.result)

    # Insert a \section heading before the first entry of each letter.
    # BUG FIX: the original wrapped this loop in try/bare-except, but
    # str.replace cannot raise here -- the guard only hid real errors.
    for char in ascii_lowercase:
        string = string.replace("\\begin{entry}{" + char,
                                "\\section{" + char + "}\n\n\\begin{entry}{" + char, 1)

    with open("./final.tex", "w", encoding="utf-8") as f:
        # NOTE(review): the original called string.replace("", "") -- a no-op,
        # probably a stripped character lost in transcription; verify whether
        # some control character should be removed before writing.
        f.write(string)
6 R" \# H2 ]. b# s; R: _
. |( `1 f8 @9 u/ g-
复制代码 |
|