TA的每日心情 | 擦汗 2021-11-17 09:18 |
---|
签到天数: 79 天 [LV.6]常住居民II
|
发表于 2021-10-7 07:59:27
|
显示全部楼层
好的!! 麻烦了
import glob
import os
import re
import time
from collections import Counter

# Collect every file underneath a directory tree.
def getFileListAll(filePath):
    '''
    func: list all files below a directory, descending into sub-directories
    :param filePath: root directory handed to os.walk
    :return: list of paths, each built by joining the walked directory
             with the file name; empty when os.walk yields nothing
    '''
    return [
        os.path.join(folder, leaf)
        for folder, _subdirs, leaves in os.walk(filePath)
        for leaf in leaves
    ]

# Check that every name in the list looks like a txt document.
def isTxts(nameList):
    '''
    func: verify that all file names are txt documents
    :param nameList: list of file names (with extension)
    :return: True when every name matches, otherwise False after printing
             the list of offending names

    A name matches when it does not start with "~" or "$", has at least
    two characters before the extension, and ends in ".txt"
    (case-insensitive).
    '''
    patternObj = re.compile(r"^[^~$].+\.(txt)$", re.I)
    # Collect the offending *names*; appending the whole input list here
    # would report every file, once per offender.
    notTxtList = [
        fileName for fileName in nameList
        if not patternObj.fullmatch(fileName)
    ]
    if notTxtList:
        print("存在非txt文件:{0:},请处理后再运行程序!".format(notTxtList))
        return False
    return True

# Read the full content of a txt document.
def readTxt(fileName):
    '''
    func: read a UTF-8 txt document into a string
    :param fileName: path of the file to read
    :return: the file content, or "" when the path does not match the txt
             pattern or the file cannot be opened or decoded (a message is
             printed in both failure cases)
    '''
    patternObj = re.compile(r"^[^~$].+\.(txt)$", re.I)
    if not patternObj.fullmatch(fileName):
        # Return a string here too, so callers can always call str methods
        # on the result instead of hitting None.
        print("读取文档失败:{0:}".format(fileName))
        return ""
    try:
        # The context manager closes the handle even if read() raises.
        with open(fileName, "r", encoding="utf-8") as f:
            return f.read()
    except (OSError, ValueError):
        # OSError: missing file / permissions; ValueError covers
        # UnicodeDecodeError and malformed paths.
        print("读取文档失败:{0:}".format(fileName))
        return ""

def writeTxt(txt,outPath):
    '''
    func: append text to a UTF-8 file, creating its parent directories
    :param txt: text to write
    :param outPath: destination file path; opened in append mode, so an
                    existing file is extended rather than overwritten
    :return: True on success, False (after printing a message) when the
             directory cannot be created or the file cannot be written
    '''
    # os.path.dirname understands the platform's separators; splitting on a
    # backslash only works for Windows-style paths.
    folder = os.path.dirname(outPath)
    try:
        if folder:
            # makedirs creates missing intermediate directories and does
            # not fail when the directory already exists.
            os.makedirs(folder, exist_ok=True)
        with open(outPath, "a", encoding="utf-8") as f:
            f.write(txt)
    except (OSError, UnicodeError):
        print("写入文档失败:{0:}".format(outPath))
        return False
    return True

def getfileName(fileList):
    '''
    func: strip the directory part from each path
    :param fileList: list of file paths
    :return nameList: list of file names, extension included, in the same
                      order as fileList
    '''
    # os.path.basename handles the platform's path separators, unlike a
    # manual split on a backslash.
    return [os.path.basename(fileName) for fileName in fileList]

# Report duplicated file names; False when duplicates exist, True otherwise.
def showDupFile(nameList):
    '''
    func: check whether any file name occurs more than once
    :param nameList: list of file names
    :return: False when nameList is empty (nothing is printed) or when at
             least one name is duplicated (each duplicate is printed once);
             True when all names are unique
    '''
    if not nameList:
        # Nothing to process; stay falsy so the caller skips the merge.
        return False
    # One counting pass instead of calling list.count() for every name.
    dupNameList = [name for name, hits in Counter(nameList).items() if hits > 1]
    if dupNameList:
        for name in dupNameList:
            print("{0:}为重复的文件,请处理!".format(name))
        return False
    print("没有发现文件名相同的文件,处理中...")
    return True

def nameListSort(nameList,rev=True):
    '''
    func: order names by their string length
    :param nameList: list of file names; left unmodified
    :param rev: True (default) for longest first, False for shortest first
    :return newNameList: a new, sorted list; names of equal length keep
                         their original relative order
    '''
    newNameList = list(nameList)
    newNameList.sort(key=len, reverse=rev)
    return newNameList
# Add hyperlinks
def formatTxtHref(nameList,txt):
    '''
    func: turn every occurrence of an entry name in the text into a link
    :param nameList: list of file names; each name with ".txt" removed is
                     used as a keyword
    :param txt: text to process
    :return txt: text where keywords are wrapped as
                 <a href='entry://KEYWORD'>KEYWORD</a> and all "<@>"
                 helper marks are removed

    Keywords are matched as plain substrings. Any literal "muyubug" in
    the text is removed as well, because that string is used as a
    temporary sentinel.
    '''
    print("\n开始循环添加超链接关键词辅助标记:")
    # Longest names first, so a long keyword is wrapped before any shorter
    # keyword contained in it.
    for done, nameD in enumerate(sorted(nameList, key=len, reverse=True), start=1):
        keyword = nameD.replace(".txt", "")
        if keyword:
            # str.replace with an empty search string would insert the
            # marks between every character, so skip empty keywords.
            txt = txt.replace(keyword, "【@" + keyword + "@】")
        print("\r已完成第{0: ^6}个".format(done), end="")

    print("\n开始循环清理嵌套的冗余的超链接关键词辅助标记:")
    # Shortest names first: drop marks that ended up inside another mark.
    for done, nameA in enumerate(sorted(nameList, key=len), start=1):
        keyword = nameA.replace(".txt", "")
        # Escape the keyword so regex metacharacters in a file name
        # (e.g. "C++", "a(b)") are matched literally.
        escaped = re.escape(keyword)

        # Marked keyword preceded by an opening mark: unwrap the keyword.
        # Callable replacements keep backslashes in the keyword literal.
        nestedInside = re.compile(r"【@([^@】]*?)【@{0:}@】".format(escaped))
        txt = nestedInside.sub(
            lambda m, kw=keyword: "【@" + m.group(1) + "muyubug" + kw, txt)

        # Marked keyword followed by a closing mark: unwrap the keyword.
        nestedAtStart = re.compile(r"【@{0:}@】([^【@]*?)@】".format(escaped))
        txt = nestedAtStart.sub(
            lambda m, kw=keyword: kw + m.group(1) + "muyubug" + "@】", txt)

        txt = txt.replace("muyubug", "")
        print("\r已完成第{0: ^6}个".format(done), end="")

    print("\n开始添加超链接,")
    linkPattern = re.compile(r"【@([^【@]+?)@】")
    txt = linkPattern.sub(r"<a href='entry://\1'>\1</a>", txt)

    print("开始清理辅助标记,")
    txt = txt.replace("<@>", "")
    return txt

def formatText(fileName):
    '''
    func: build one entry (headword line + body) from a txt file
    :param fileName: path of the file to process
    :return txt: the file name (directory and ".txt" removed) with "<@>"
                 between its characters, a newline, the file content with
                 "\\n" and "<br>" replaced by "<BR>", then "<BR>\\n</>"
    '''
    # Headword: file name without directory and ".txt". basename handles
    # the platform's separators, unlike splitting on a backslash.
    name = os.path.basename(fileName).replace(".txt", "")
    # Put a mark between the characters of the headword; line breaks are
    # dropped from it, as the headword occupies a single line.
    nameMaked = "<@>".join(name.replace("\n", ""))
    txtTitle = nameMaked + "\n"
    # Fall back to an empty body if readTxt yields nothing usable.
    txtContent = readTxt(fileName) or ""
    # Replace line breaks and lower-case <br> tags in the body.
    for ch in ("\n", "<br>"):
        txtContent = txtContent.replace(ch, "<BR>")
    # Assemble the whole entry.
    return txtTitle + txtContent + "<BR>\n</>"

def mdxFormat(path,outputPath):
    '''
    func: merge all txt files below a directory into one formatted file
    :param path: directory containing the txt files to format
    :param outputPath: path and file name of the output

    Stops without writing when isTxts or showDupFile rejects the file
    names.
    '''
    fileList = getFileListAll(path)  # every file, sub-directories included
    nameList = getfileName(fileList)
    if not isTxts(nameList):
        return
    if not showDupFile(nameList):
        return

    print("找到了{0:}个txt文档,".format(len(nameList)))
    print("开始格式化、合并文档:")
    entries = []
    for done, fileName in enumerate(fileList, start=1):
        entries.append(formatText(fileName))
        print("\r已完成第{0: ^6}个".format(done), end="")
    merged = "\n".join(entries)

    # Add hyperlinks
    merged = formatTxtHref(nameList, merged)
    print("开始写入文本,")
    if writeTxt(merged, outputPath):
        print("文件合并输出成功!")
    else:
        print("Error:Merge!")

def main(path=r"/Users/vivian/Downloads/mdxFormat_upload_V3.5/test",
         outputPath=r"/Users/vivian/Downloads/mdxFormat_upload_V3.5/output/demo.txt"):
    '''
    func: run mdxFormat and print the elapsed time in whole seconds
    :param path: directory with the txt files to merge
    :param outputPath: path and file name of the merged output
    '''
    # perf_counter is meant for measuring intervals; it is not affected by
    # system clock adjustments.
    timeStart = time.perf_counter()

    mdxFormat(path, outputPath)

    timeEnd = time.perf_counter()
    print("程序运行了%d秒" % (timeEnd - timeStart))


if __name__ == '__main__':
    main()

复制代码 |
|