TA的每日心情 | 开心 2019-8-21 08:44 |
---|
签到天数: 163 天 [LV.7]常住居民III
|
发表于 2018-11-16 21:27:49
|
显示全部楼层
- '''
, k" P# X3 k1 T: h1 P - Based on xmllarge.py
& s- _ N2 u" g$ x - '''
! d/ J2 O W1 {6 y9 S - # from pyquery import PyQuery as pq
% S" _( v; A% V9 `( M. s - from pathlib import Path
6 R8 F% s2 j k$ P3 t3 v5 p - $ R3 x6 S7 G! Y' e
def xml_iter(file, tag):
    '''
    Iterate over the ``<tag>...</tag>`` elements of a huge XML file.

    The file is read line by line in binary mode and only the element
    currently being assembled is kept in memory.  Opening and closing
    tags may sit on their own lines, share a line, or appear several
    times on one line.

    Matching is purely textual: an opening tag is ``<tag>`` or ``<tag ``
    (tag name followed by a space), a closing tag is ``</tag>``.
    Self-closing elements and nested elements with the same name are
    not supported.

    :file: file path
    :tag: element to retrieve
    :returns: generator yielding each element as ``bytes``, from the
        start of its opening tag through the end of its closing tag
    '''
    open_bare = ('<' + tag + '>').encode()
    open_attr = ('<' + tag + ' ').encode()
    closing = ('</' + tag + '>').encode()

    with open(file, 'rb') as inputfile:
        # ``parts`` is None while we are outside an element; otherwise it
        # collects the chunks of the element being assembled.
        parts = None
        for line in inputfile:
            pos = 0
            # A single line may hold several elements, so keep scanning
            # until the line is exhausted.
            while True:
                if parts is None:
                    starts = [
                        idx
                        for idx in (line.find(open_bare, pos),
                                    line.find(open_attr, pos))
                        if idx >= 0
                    ]
                    if not starts:
                        # No further opening tag: text outside elements
                        # (including a stray closing tag) is ignored.
                        break
                    pos = min(starts)
                    parts = []
                end = line.find(closing, pos)
                if end < 0:
                    # Element continues on the next line.
                    parts.append(line[pos:])
                    break
                end += len(closing)
                parts.append(line[pos:end])
                yield b''.join(parts)
                parts = None
                pos = end
复制代码
9 e o/ j+ ~' B6 k
这么多人找这东西?我过一阵打包发个小工具。
9 M# X% p( S3 R2 z. s
上面的python3函数用法:
resu = b''
for elm in xml_iter(filename, 'tu'):
    resu += elm
1 [* D* w- ]6 Z; p0 L; V6 E
内存足迹极小……不管文件多大。 |
|