TA的每日心情 | 开心 2019-8-21 08:44 |
---|
签到天数: 163 天 [LV.7]常住居民III
|
发表于 2018-11-16 21:27:49
|
显示全部楼层
'''
Based on xmllarge.py
'''
# from pyquery import PyQuery as pq
from pathlib import Path


def xml_iter(file, tag):
    '''
    Stream the <tag> ... </tag> elements of a huge XML file one at a time.

    The file is read line by line in binary mode, so only the element
    currently being collected is held in memory.  Tags may sit on their own
    lines or in the middle of a line, and one line may hold several elements.

    Matching is purely textual (no XML parsing):

    * an opening tag is ``<tag>`` or ``<tag `` (name followed by a space);
      self-closing ``<tag/>`` is not recognised;
    * a closing tag seen outside an element is ignored;
    * an opening tag seen while an element is still unfinished discards the
      unfinished data and restarts collection there, so nested elements of
      the same name are not supported.

    :file: file path
    :tag: element to retrieve
    :yields: bytes running from the opening tag through the closing tag
    '''
    open_bare = ('<' + tag + '>').encode()
    open_attr = ('<' + tag + ' ').encode()
    close_tag = ('</' + tag + '>').encode()

    def next_open(line, start):
        # Earliest position of either opening form at or after `start`.
        hits = [
            i
            for i in (line.find(open_bare, start), line.find(open_attr, start))
            if i != -1
        ]
        return min(hits) if hits else -1

    with open(file, 'rb') as inputfile:
        # None while outside an element, otherwise the pieces collected so
        # far (joined once at the end instead of repeated bytes +=).
        chunks = None
        for line in inputfile:
            seg = 0   # where the current element's data starts in this line
            scan = 0  # where to look for the next tag in this line
            while True:
                opening = next_open(line, scan)
                # A closing tag only matters while an element is open.
                closing = line.find(close_tag, scan) if chunks is not None else -1

                if closing != -1 and (opening == -1 or closing < opening):
                    end = closing + len(close_tag)
                    chunks.append(line[seg:end])
                    yield b''.join(chunks)
                    chunks = None
                    seg = scan = end
                elif opening != -1:
                    # Start (or restart) collecting at this opening tag.
                    chunks = []
                    seg = opening
                    scan = opening + 1
                else:
                    # No more tags on this line: keep the tail if inside.
                    if chunks is not None:
                        chunks.append(line[seg:])
                    break
复制代码

这么多人找这东西?我过一阵打包发个小工具。

上面的python3函数用法:
# xml_iter yields bytes, so the accumulator must be bytes as well
# (starting from '' raises TypeError on the first +=).
resu = b''
for elm in xml_iter(filename, 'tu'):
    resu += elm
内存足迹极小……不管文件多大。 |
|