TA的每日心情 | 开心 2019-8-21 08:44 |
---|
签到天数: 163 天 [LV.7]常住居民III
|
发表于 2018-11-16 21:27:49
|
显示全部楼层
'''
Based on xmllarge.py
'''
# from pyquery import PyQuery as pq
- from pathlib import Path
2 K& Z+ {6 D- f7 O - ! \% _/ i# y, f$ w/ o3 S+ T
- & @- ?! U; n* x6 n0 u" T
def _first_opening(line, start, openers):
    '''Return the lowest index >= start at which any opener occurs in line, or -1.'''
    hits = [idx for idx in (line.find(opener, start) for opener in openers)
            if idx >= 0]
    return min(hits) if hits else -1


def xml_iter(file, tag):
    '''
    Stream the <tag>...</tag> elements of a (possibly huge) XML file.

    The file is read line by line in binary mode, so only the element
    currently being assembled is held in memory.  Opening and closing tags
    may sit on their own lines or in the middle of a line, and several
    elements may share one line.

    Limitations: an opening tag is recognised only as ``<tag>`` or
    ``<tag `` (tag name followed by a space); elements of the same tag
    nested inside each other are not supported.

    :file: file path
    :tag: element to retrieve (str)
    :yields: bytes, from the start of the opening tag through the end of
             the closing tag, with inner line breaks preserved
    '''
    openers = (('<' + tag + '>').encode(), ('<' + tag + ' ').encode())
    closer = ('</' + tag + '>').encode()

    with open(file, 'rb') as inputfile:
        chunks = []      # pieces of the element currently being collected
        inside = False   # True between an opening tag and its closing tag
        for line in inputfile:
            # pos: first byte of this line that belongs to the element;
            # scan: where the search for the next tag resumes.
            pos = scan = 0
            while True:
                if not inside:
                    start = _first_opening(line, scan, openers)
                    if start < 0:
                        # Nothing to collect on the rest of this line; a
                        # stray closing tag out here is simply ignored.
                        break
                    inside = True
                    chunks = []
                    # Skip past the '<' so the opener is not found again.
                    pos, scan = start, start + 1

                end = line.find(closer, scan)
                restart = _first_opening(line, scan, openers)
                if restart >= 0 and (end < 0 or restart < end):
                    # A new opening tag before the current element was
                    # closed: drop the unfinished data and start over.
                    chunks = []
                    pos, scan = restart, restart + 1
                    continue

                if end < 0:
                    # Element continues on the next line.
                    chunks.append(line[pos:])
                    break

                end += len(closer)
                chunks.append(line[pos:end])
                yield b''.join(chunks)
                chunks = []
                inside = False
                # Keep scanning: another element may start on this line.
                pos = scan = end
复制代码
8 i% R; n r+ _6 w6 S# ~5 K& l* s7 ^+ B& t) M H5 t( t7 @
这么多人找这东西?我过一阵打包发个小工具。
上面的python3函数用法
# Usage: xml_iter yields bytes, so the accumulator must be bytes as well
# (starting from '' would raise TypeError on the first +=).
resu = b''
for elm in xml_iter(filename, 'tu'):
    resu += elm
* {3 U* L$ M# d
内存足迹极小……不管文件多大。 |
|