TA的每日心情 | 开心 2019-8-21 08:44 |
---|
签到天数: 163 天 [LV.7]常住居民III
|
发表于 2018-11-16 21:27:49
|
显示全部楼层
'''
Based on xmllarge.py
'''
import re
from pathlib import Path

# from pyquery import PyQuery as pq

def xml_iter(file, tag):
    '''
    Stream the <tag>...</tag> elements of a huge XML file one at a time.

    The file is opened in binary mode and scanned line by line, so only
    the element currently being collected is held in memory.  Each yielded
    item is a bytes object that runs from the '<' of the opening tag
    through the '>' of the closing tag, including any line breaks in
    between.

    Opening and closing tags may appear anywhere in a line, and one line
    may contain several complete elements.

    This is a plain text scan, not an XML parser:
    - self-closing elements (<tag/>) are not yielded;
    - tags inside comments or CDATA sections are not told apart;
    - an opening tag met while an element is still open discards what was
      collected so far and restarts from that tag (so for nested elements
      of the same name only the innermost one is yielded);
    - a closing tag with no open element is ignored.

    :file: file path
    :tag: element to retrieve
    :returns: generator of bytes, one item per element
    '''
    name = re.escape(tag.encode())
    # Group 1 matches an opening tag ('<tag' followed by '>' or ASCII
    # whitespace, so '<tag' never matches '<tagother>'); the second
    # alternative, which has no group, matches the closing tag.
    token_re = re.compile(b'(<' + name + b'[>\\s])|</' + name + b'>')

    with open(file, 'rb') as inputfile:
        pieces = None  # None while outside an element, else list of bytes
        for line in inputfile:
            start = 0  # where the not-yet-stored part of this line begins
            for token in token_re.finditer(line):
                if token.group(1) is not None:
                    # Opening tag: (re)start collecting from its '<'.
                    pieces = []
                    start = token.start()
                elif pieces is not None:
                    # Closing tag of the open element: emit it.
                    pieces.append(line[start:token.end()])
                    yield b''.join(pieces)
                    pieces = None
                # else: stray closing tag, nothing to close -> skip it.
            if pieces is not None:
                # Element continues on the next line; keep the remainder.
                pieces.append(line[start:])
复制代码

这么多人找这东西?我过一阵打包发个小工具。

上面的python3函数用法
# Usage example: gather every <tu> element of the file into one value.
# `filename` is the path of the XML file and must be set by the caller.
# xml_iter reads the file in binary mode and yields bytes, so the pieces
# are joined into a bytes object; adding them to a str raises TypeError.
# NOTE: joining keeps every element in memory at once. For very large
# files, handle each element inside a `for elm in xml_iter(...)` loop.
resu = b''.join(xml_iter(filename, 'tu'))

内存足迹极小……不管文件多大。 |
|