TA的每日心情 | 擦汗 2021-11-17 09:18 |
---|
签到天数: 79 天 [LV.6]常住居民II
|
发表于 2021-10-7 07:59:27
|
显示全部楼层
好的!! 麻烦了

import glob
import os
import re
import time
from collections import Counter

# Walk a directory tree and collect every file found in it.
def getFileListAll(filePath):
    """Return the paths of all files under ``filePath``, including subfolders.

    :param filePath: root directory handed to ``os.walk``
    :return: list of paths, each built as ``os.path.join(root, filename)``;
        empty when the walk yields no files
    """
    return [
        os.path.join(root, filename)
        for root, _dirnames, filenames in os.walk(filePath)
        for filename in filenames
    ]

# Check that every file name in the list is a txt document.
def isTxts(nameList):
    """Report whether every name in ``nameList`` is an acceptable txt file.

    A name is accepted when it ends in ``.txt`` (case-insensitive) and its
    first character is neither ``~`` nor ``$`` (presumably to reject editor
    temp/lock files -- confirm with the author).  When some names are
    rejected they are printed so the user can clean up the folder.

    :param nameList: file names including their extension
    :return: ``True`` when every name is accepted (also for an empty list),
        ``False`` otherwise
    """
    # ``.*`` rather than ``.+`` so that one-character stems such as "a.txt"
    # are accepted as well.
    patternObj = re.compile(r"^[^~$].*\.txt$", re.I)
    # Collect the offending names themselves (not the whole input list).
    notTxtList = [name for name in nameList if not patternObj.fullmatch(name)]
    if notTxtList:
        print("存在非txt文件:{0:},请处理后再运行程序!".format(notTxtList))
        return False
    return True
# Read the whole content of a txt document.
def readTxt(fileName):
    """Return the text stored in the txt file at ``fileName``.

    :param fileName: path of the file to read
    :return: the file content decoded as UTF-8; ``""`` when the base name is
        not an acceptable ``.txt`` name or when the file cannot be opened or
        decoded (a message is printed in the latter case)
    """
    patternObj = re.compile(r"^[^~$].*\.txt$", re.I)
    # The name rule is applied to the base name so that the directory part of
    # the path cannot influence the check.
    if not patternObj.fullmatch(os.path.basename(fileName)):
        # Always hand back a string so callers can chain str methods.
        return ""
    try:
        # ``with`` closes the handle even when read() raises.
        with open(fileName, "r", encoding="utf-8") as f:
            return f.read()
    except (OSError, UnicodeDecodeError):
        print("读取文档失败:{0:}".format(fileName))
        return ""

def writeTxt(txt,outPath):
    """Append ``txt`` to the file at ``outPath``, creating its folder if needed.

    :param txt: text to write (encoded as UTF-8)
    :param outPath: path of the output file; opened in append mode, so an
        existing file keeps its previous content
    :return: ``True`` on success, ``False`` when the folder cannot be created
        or the file cannot be written (a message is printed)
    """
    try:
        # os.path.dirname works with the platform's separator, unlike a manual
        # split on one hard-coded separator character.
        folder = os.path.dirname(outPath)
        if folder:
            # makedirs also creates missing intermediate folders.
            os.makedirs(folder, exist_ok=True)
        with open(outPath, "a", encoding="utf-8") as f:
            f.write(txt)
    except (OSError, UnicodeError):
        print("写入文档失败:{0:}".format(outPath))
        return False
    return True

def getfileName(fileList):
    """Strip the directory part from every path in ``fileList``.

    :param fileList: list of file paths
    :return nameList: list of file names (extension kept), in the same order
    """
    # os.path.basename follows the platform's path rules instead of assuming
    # one particular separator character.
    return [os.path.basename(fileName) for fileName in fileList]

# Print duplicated file names; return False if any exist, True otherwise.
def showDupFile(nameList):
    """Check ``nameList`` for file names that occur more than once.

    Every duplicated name is printed once.  When there are none, a progress
    message is printed instead.

    :param nameList: list of file names
    :return: ``False`` when at least one name is duplicated or when the list
        is empty (so callers do not continue with nothing to process),
        ``True`` otherwise
    """
    if not nameList:
        return False
    # One counting pass instead of calling list.count() for every name.
    dupNameList = [name for name, hits in Counter(nameList).items() if hits > 1]
    if dupNameList:
        for dupName in dupNameList:
            print("{0:}为重复的文件,请处理!".format(dupName))
        return False
    print("没有发现文件名相同的文件,处理中...")
    return True

def nameListSort(nameList,rev=True):
    """Return a new list with the names ordered by string length.

    :param nameList: list of file names; it is not modified
    :param rev: ``True`` (default) for longest first, ``False`` for shortest
        first
    :return newNameList: the sorted copy; names of equal length keep their
        original relative order because ``sorted`` is stable
    """
    return sorted(nameList, key=len, reverse=rev)

# Add hyperlinks.
def formatTxtHref(nameList,txt):
    """Turn occurrences of entry names inside ``txt`` into ``entry://`` links.

    The work is done in four passes:

    1. Longest name first, wrap every occurrence of each name in the
       auxiliary markers ``【@`` ... ``@】``.
    2. Shortest name first, remove markers that ended up nested inside
       another marked span (a short name that is part of a longer one).
    3. Replace each remaining marked span with
       ``<a href='entry://NAME'>NAME</a>``.
    4. Delete every ``<@>`` helper tag from the text.

    :param nameList: file names; ``".txt"`` is removed from each to get the
        keyword
    :param txt: the text to process
    :return txt: the text with links inserted
    """
    print("\n开始循环添加超链接关键词辅助标记:")
    count = 0
    for nameD in sorted(nameList, key=len, reverse=True):
        # NOTE: str.replace is case-sensitive and removes every ".txt"
        # occurrence, not only a trailing extension.
        nameDSimple = nameD.replace(".txt", "")
        # An empty keyword would make str.replace insert the markers between
        # every character of the text, so it is skipped.
        if nameDSimple:
            txt = txt.replace(nameDSimple, "【@" + nameDSimple + "@】")
        count += 1
        print("\r已完成第{0: ^6}个".format(count), end="")

    print("\n开始循环清理嵌套的冗余的超链接关键词辅助标记:")
    count = 0
    for nameA in sorted(nameList, key=len):
        nameA = nameA.replace(".txt", "")
        if nameA:
            # Escape the keyword so regex metacharacters in a file name
            # (e.g. "c++") are matched literally.
            namePattern = re.escape(nameA)
            # In a replacement template only the backslash is special.
            nameRepl = nameA.replace("\\", "\\\\")
            # "【@ ... 【@name@】" -> "【@ ... name": drop an inner marker that
            # follows the opening of an outer one.  \g<1> keeps the group
            # reference unambiguous when the keyword starts with a digit.
            txt = re.sub(
                r"【@([^@】]*?)【@" + namePattern + r"@】",
                r"【@\g<1>" + nameRepl,
                txt,
            )
            # "【@name@】 ... @】" -> "name ... @】": drop an inner marker that
            # precedes the closing of an outer one.
            txt = re.sub(
                r"【@" + namePattern + r"@】([^【@]*?)@】",
                nameRepl + r"\g<1>@】",
                txt,
            )
        count += 1
        print("\r已完成第{0: ^6}个".format(count), end="")

    print("\n开始添加超链接,")
    txt = re.sub(r"【@([^【@]+?)@】", r"<a href='entry://\1'>\1</a>", txt)

    print("开始清理辅助标记,")
    return txt.replace("<@>", "")

def formatText(fileName):
    """Build one dictionary entry (title line plus body) from a txt file.

    The title is the file's base name without ``".txt"``, with ``<@>``
    inserted between its characters.  The body is the file content with
    every newline and every ``<br>`` replaced by ``<BR>``.  The entry ends
    with ``<BR>``, a newline and the ``</>`` terminator line.

    :param fileName: path of the file to process
    :return txt: the formatted entry text
    """
    # Base name via os.path so the platform's separator is honoured.
    # NOTE: str.replace is case-sensitive and removes every ".txt"
    # occurrence, not only a trailing extension.
    name = os.path.basename(fileName).replace(".txt", "")
    # Put a marker between the characters of the title.
    nameMaked = "<@>".join(name)
    # Fall back to "" so a failed read cannot break the replacements below.
    txtContent = readTxt(fileName) or ""
    for ch in ("\n", "<br>"):
        txtContent = txtContent.replace(ch, "<BR>")
    # Title line, body, then the "</>" terminator on its own line.
    return nameMaked + "\n" + txtContent + "<BR>\n</>"

def mdxFormat(path,outputPath):
    """Merge all txt files under ``path`` into one formatted output file.

    Nothing is written when ``isTxts`` or ``showDupFile`` rejects the file
    names found under ``path``.

    :param path: folder holding the txt files (subfolders included)
    :param outputPath: path and file name of the merged output
    :return: ``None``
    """
    fileList = getFileListAll(path)  # every file, subfolders included
    nameList = getfileName(fileList)
    # Guard clauses: stop early when the input folder is not clean.
    if not isTxts(nameList):
        return
    if not showDupFile(nameList):
        return
    print("找到了{0:}个txt文档,".format(len(nameList)))

    print("开始格式化、合并文档:")
    txtList = []  # formatted entry of every file
    for count, fileName in enumerate(fileList, start=1):
        txtList.append(formatText(fileName))
        print("\r已完成第{0: ^6}个".format(count), end="")
    txt = "\n".join(txtList)

    # Add hyperlinks between the entries.
    txt = formatTxtHref(nameList, txt)
    print("开始写入文本,")
    if writeTxt(txt, outputPath):
        print("文件合并输出成功!")
    else:
        print("Error:Merge!")

def main(
    path=r"/Users/vivian/Downloads/mdxFormat_upload_V3.5/test",
    outputPath=r"/Users/vivian/Downloads/mdxFormat_upload_V3.5/output/demo.txt",
):
    """Run ``mdxFormat`` and print how long it took.

    :param path: folder holding the txt files; defaults to the location used
        by the original script
    :param outputPath: path of the merged output file; defaults to the
        location used by the original script
    """
    timeStart = time.time()
    mdxFormat(path, outputPath)
    timeEnd = time.time()
    # %d truncates the elapsed time to whole seconds.
    print("程序运行了%d秒"%(timeEnd-timeStart))


if __name__ == '__main__':
    main()
复制代码 |
|