TA的每日心情 | 擦汗 2021-11-17 09:18 |
---|
签到天数: 79 天 [LV.6]常住居民II
|
发表于 2021-10-7 07:59:27
|
显示全部楼层
好的!! 麻烦了
import collections
import glob
import os
import re
import time

def getFileListAll(filePath):
    """Collect every file below a directory, including sub-directories.

    :param filePath: root directory to walk
    :return: list of file paths, each built as ``os.path.join(root, filename)``;
        empty when the directory contains no files or cannot be walked
    """
    return [
        os.path.join(root, filename)
        for root, _dirnames, filenames in os.walk(filePath)
        for filename in filenames
    ]

def isTxts(nameList):
    """Check that every name in the list looks like a usable txt file.

    A name is accepted when it fully matches ``^[^~$].+\\.(txt)$``
    case-insensitively, i.e. it does not start with ``~`` or ``$`` and ends
    in ``.txt``.
    NOTE: the pattern needs at least two characters before the extension,
    so a name such as ``a.txt`` is rejected.

    :param nameList: file names (with extension) to validate
    :return: True when all names are accepted (also for an empty list);
        otherwise prints the rejected names and returns False
    """
    txtPattern = re.compile(r"^[^~$].+\.(txt)$", re.I)
    # Collect the offending names themselves so the message is actionable.
    notTxtList = [name for name in nameList if not txtPattern.fullmatch(name)]
    if notTxtList:
        print("存在非txt文件:{0:},请处理后再运行程序!".format(notTxtList))
        return False
    return True

def readTxt(fileName):
    """Read a UTF-8 txt file and return its whole content.

    The path is only read when it fully matches ``^[^~$].+\\.(txt)$``
    case-insensitively. Note that the match is applied to the complete
    path string as given, not just to its last component.

    :param fileName: path of the file to read
    :return: the file content; an empty string when the path does not match
        the txt pattern or when the file cannot be opened or decoded (a
        failure message is printed in the latter case)
    """
    txtPattern = re.compile(r"^[^~$].+\.(txt)$", re.I)
    if not txtPattern.fullmatch(fileName):
        # Always hand back a string so callers can chain str methods safely.
        return ""
    try:
        # The context manager closes the handle even when read() fails.
        with open(fileName, "r", encoding="utf-8") as f:
            return f.read()
    except (OSError, ValueError):
        # ValueError covers UnicodeDecodeError for files that are not UTF-8.
        print("读取文档失败:{0:}".format(fileName))
        return ""

def writeTxt(txt, outPath):
    """Append text to a UTF-8 file, creating parent directories as needed.

    :param txt: text to append
    :param outPath: path of the output file; it is opened in append mode,
        so existing content is kept
    :return: True on success; False (after printing a failure message) when
        the directory cannot be created or the file cannot be written
    """
    try:
        # os.path.dirname understands the platform's separators; splitting
        # on a backslash alone yields an empty directory for POSIX paths.
        parent = os.path.dirname(outPath)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(outPath, "a", encoding="utf-8") as f:
            f.write(txt)
    except (OSError, ValueError):
        # ValueError covers UnicodeEncodeError for text that cannot be encoded.
        print("写入文档失败:{0:}".format(outPath))
        return False
    return True

def getfileName(fileList):
    """Extract the file name (with extension) from each path.

    Both ``\\`` and ``/`` are treated as separators, so paths produced on
    Windows and on POSIX systems are handled alike.

    :param fileList: list of file paths
    :return nameList: list of file names including their extension, in the
        same order as ``fileList``
    """
    return [re.split(r"[\\/]", fileName)[-1] for fileName in fileList]

def showDupFile(nameList):
    """Report file names that occur more than once.

    :param nameList: list of file names to check
    :return: True when the list is non-empty and has no duplicates; False
        when duplicates exist (each one is printed, in first-seen order) or
        when the list is empty
    """
    if not nameList:
        # Nothing to process: stay falsy so callers skip the merge step.
        return False

    # One counting pass instead of list.count() for every distinct name.
    occurrences = collections.Counter(nameList)
    DupNameList = [name for name, seen in occurrences.items() if seen > 1]
    if DupNameList:
        for name in DupNameList:
            print("{0:}为重复的文件,请处理!".format(name))
        return False

    print("没有发现文件名相同的文件,处理中...")
    return True

def nameListSort(nameList, rev=True):
    """Sort names by string length without modifying the input list.

    :param nameList: list of file names
    :param rev: True (default) for longest first, False for shortest first
    :return newNameList: new list ordered by length; names of equal length
        keep their original relative order
    """
    return sorted(nameList, key=len, reverse=rev)

def formatTxtHref(nameList, txt):
    """Turn every occurrence of a headword in the text into an entry link.

    A headword is a file name with ``.txt`` removed. The work is done in
    four passes:

    1. wrap each headword occurrence in ``【@`` ... ``@】`` markers, longest
       names first;
    2. remove markers that ended up nested inside another marked span,
       shortest names first;
    3. replace each remaining marked span with
       ``<a href='entry://X'>X</a>``;
    4. delete the ``<@>`` separators from the text.

    Progress messages are printed along the way.

    :param nameList: list of file names (with extension)
    :param txt: text to process
    :return txt: the text with hyperlinks added
    """
    # Temporary separator placed next to the ``\\1`` group reference so that a
    # headword starting with a digit cannot be read as part of the group
    # number. It is stripped again after every clean-up round, which also
    # removes any literal occurrence of the same word from the text.
    sentinel = "muyubug"

    print("\n开始循环添加超链接关键词辅助标记:")
    for count, name in enumerate(sorted(nameList, key=len, reverse=True), start=1):
        headword = name.replace(".txt", "")
        if headword:
            # An empty headword would insert markers between all characters.
            txt = txt.replace(headword, "【@" + headword + "@】")
        print("\r已完成第{0: ^6}个".format(count), end="")

    print("\n开始循环清理嵌套的冗余的超链接关键词辅助标记:")
    for count, name in enumerate(sorted(nameList, key=len), start=1):
        headword = name.replace(".txt", "")
        if headword:
            # Headwords are plain text: escape them for the pattern and for
            # the replacement template so characters such as "+", "." or a
            # backslash are taken literally.
            literal = re.escape(headword)
            replName = headword.replace("\\", "\\\\")

            # Marked headword preceded by an unclosed opening marker:
            # drop the inner pair of markers.
            txt = re.sub(
                r"【@([^@】]*?)【@" + literal + r"@】",
                "【@\\1" + sentinel + replName,
                txt,
            )
            # Marked headword followed by a dangling closing marker:
            # drop the inner pair of markers.
            txt = re.sub(
                r"【@" + literal + r"@】([^【@]*?)@】",
                replName + "\\1" + sentinel + "@】",
                txt,
            )
        txt = txt.replace(sentinel, "")
        print("\r已完成第{0: ^6}个".format(count), end="")

    print("\n开始添加超链接,")
    txt = re.sub(r"【@([^【@]+?)@】", r"<a href='entry://\1'>\1</a>", txt)

    print("开始清理辅助标记,")
    return txt.replace("<@>", "")

def formatText(fileName):
    """Build one dictionary entry (headword line plus body) from a txt file.

    The headword is the file name without ``.txt``, with ``<@>`` inserted
    between its characters. The body is the file content with every
    newline and ``<br>`` replaced by ``<BR>``. The entry ends with
    ``<BR>``, a newline and ``</>``.

    :param fileName: path of the file to process
    :return txt: the formatted entry text
    """
    # Accept both "\\" and "/" as separators so that only the last path
    # component becomes the headword on every platform.
    name = re.split(r"[\\/]", fileName)[-1].replace(".txt", "")
    txtTitle = "<@>".join(name) + "\n"

    # Fall back to an empty body if the reader hands back nothing.
    txtContent = readTxt(fileName) or ""
    for token in ("\n", "<br>"):
        txtContent = txtContent.replace(token, "<BR>")

    return txtTitle + txtContent + "<BR>\n</>"

def mdxFormat(path, outputPath):
    """Merge all txt files under a directory into one hyperlinked text file.

    Processing stops early, without writing anything, when the name check
    or the duplicate check on the collected file names fails.

    :param path: directory containing the txt files (searched recursively)
    :param outputPath: path and file name of the merged output
    """
    fileList = getFileListAll(path)
    nameList = getfileName(fileList)
    if not isTxts(nameList):
        return
    if not showDupFile(nameList):
        return

    print("找到了{0:}个txt文档,".format(len(nameList)))

    print("开始格式化、合并文档:")
    txtList = []
    for count, fileName in enumerate(fileList, start=1):
        txtList.append(formatText(fileName))
        print("\r已完成第{0: ^6}个".format(count), end="")
    txt = "\n".join(txtList)

    txt = formatTxtHref(nameList, txt)

    print("开始写入文本,")
    if writeTxt(txt, outputPath):
        print("文件合并输出成功!")
    else:
        print("Error:Merge!")

def main():
    """Run the merge on the configured paths and print the elapsed seconds."""
    timeStart = time.time()

    # Input directory and output file; edit these two paths before running.
    path = r"/Users/vivian/Downloads/mdxFormat_upload_V3.5/test"
    outputPath = r"/Users/vivian/Downloads/mdxFormat_upload_V3.5/output/demo.txt"

    mdxFormat(path, outputPath)

    timeEnd = time.time()
    print("程序运行了%d秒" % (timeEnd - timeStart))


if __name__ == '__main__':
    main()
复制代码 |
|