TA的每日心情 | 开心 2019-8-21 08:44 |
---|
签到天数: 163 天 [LV.7]常住居民III
|
发表于 2018-11-16 21:27:49
|
显示全部楼层
- '''
- Based on xmllarge.py
- '''
' A% G8 h/ u0 E7 e$ B( s - # from pyquery import PyQuery as pq
+ R5 u( c5 r3 B2 r7 P* U - from pathlib import Path! f, [) L- i* N
! c, @ T8 |* Z5 q' F- ! b8 F! \1 y4 I @! F
def xml_iter(file, tag):
    '''
    Stream the elements named *tag* out of a huge XML file.

    The file is read line by line in binary mode, so only the element
    currently being collected is held in memory.  Each yielded item is
    the raw bytes from the element's opening tag through its closing
    tag, inclusive.

    Opening and closing tags may sit on their own lines, share a line,
    or appear several times on one line.  A closing tag seen while no
    element is open is ignored.

    Limitations: this is plain byte matching, not XML parsing.  Only the
    opening forms ``<tag>`` and ``<tag `` are recognised (not ``<tag/>``
    or a tag name followed directly by a newline/tab), and elements
    nested inside an element of the same name are not supported.

    :file: file path
    :tag: element to retrieve
    '''
    open_plain = ('<' + tag + '>').encode()   # <tag>
    open_attrs = ('<' + tag + ' ').encode()   # <tag attr="...">
    close_tag = ('</' + tag + '>').encode()   # </tag>

    with open(file, 'rb') as inputfile:
        # None while outside an element; otherwise the list of byte
        # chunks collected so far for the element being read.
        parts = None
        for line in inputfile:
            pos = 0
            # A single line may hold several tags, so keep scanning it
            # until nothing relevant is left.
            while True:
                if parts is None:
                    starts = [
                        idx
                        for idx in (line.find(open_plain, pos),
                                    line.find(open_attrs, pos))
                        if idx >= 0
                    ]
                    if not starts:
                        break  # no element starts in the rest of the line
                    pos = min(starts)
                    parts = []

                end = line.find(close_tag, pos)
                if end < 0:
                    # Element continues on the next line.
                    parts.append(line[pos:])
                    break

                end += len(close_tag)
                parts.append(line[pos:end])
                yield b''.join(parts)
                parts = None
                pos = end
复制代码

这么多人找这东西?我过一阵打包发个小工具。

上面的python3函数用法
# xml_iter yields bytes, so the result must be accumulated as bytes
# (adding bytes to a str raises TypeError).
# NOTE: joining every element keeps them all in memory; process each
# element inside a loop instead when the file is very large.
resu = b''.join(xml_iter(filename, 'tu'))
内存足迹极小……不管文件多大。 |
|