txt文本格式化工具mdxFormat（python3.9.5编写）

muyuzhth0 发表于 2021-7-25 10:47:05

本工具适用python3.9.5编写。可以把指定路径（包含子目录）下的txt文档格式化为mdxbuilder使用的格式（每个词条三行：key，value，</>）。
特点：
1.支持不同编码格式的txt文档，默认输入输出均为“gbk”编码。
2.我写这个工具是为了整理个人的常用资料，为关键词添加超链接，实现跳转。
ps：我以前一直使用其他人写好的程序格式化txt文档，但是如果关键字之间存在嵌套，超链接格式化有问题，所以我自己编写了这个程序，但还有一点不完美之处，就是不能很好的解决自身跳转的超链接。望大神指点！

muyuzhth0 发表于 2021-7-26 12:34:25

n0thing 发表于 2021-7-26 02:18
能不能發一個 .txt 的範例？

txt文档的文件名为关键字、文件内容就是词典内容。

我主要为了实现资料间的超链接跳转。

n0thing 发表于 2021-7-26 02:18:24

能不能發一個 .txt 的範例？

kyletruman 发表于 2021-7-25 11:58:06

默认输入输出均为“gbk”编码能不能默认输出UTF-8编码呢？“gbk”编码不方便文本编辑啊{:4_109:}

Mandolin 发表于 2021-7-25 12:23:14

支持mdx制作脚本

shuwushimang 发表于 2021-7-25 13:43:57

感谢分享

muyuzhth0 发表于 2021-7-25 14:34:00

本帖最后由 muyuzhth0 于 2021-7-25 14:38 编辑

kyletruman 发表于 2021-7-25 11:58
能不能默认输出UTF-8编码呢？“gbk”编码不方便文本编辑啊

我制作的txt文档源文件是gbk格式的，如果您的是utf-8，只需要把writeTxt（）函数里，f=open(outPath,"a",encoding="ANSI")这条语句的ANSI修改为utf-8即可。

muyuzhth0 发表于 2021-7-25 14:34:45

本帖最后由 muyuzhth0 于 2021-7-25 14:40 编辑

输出默认是gbk

漫步云海涧 发表于 2021-10-6 20:16:02

我的版本是 3.9.7, 遇到了问题。请帮助

❯ python mdxFormat_upload_V3.5.py
没有发现文件名相同的文件，处理中...
找到了4个txt文档，
开始格式化、合并文档：
读取文档失败:/Users/vivian/Downloads/mdxFormat_upload_V3.5/test/药品检查管理办法.txt
已完成第1 个读取文档失败:/Users/vivian/Downloads/mdxFormat_upload_V3.5/test/我的广告法管理条例.txt
已完成第2 个读取文档失败:/Users/vivian/Downloads/mdxFormat_upload_V3.5/test/北京市反食品浪费规定.txt
已完成第3 个读取文档失败:/Users/vivian/Downloads/mdxFormat_upload_V3.5/test/中国共产党组织工作条例.txt
已完成第4 个
开始循环添加超链接关键词辅助标记：
已完成第4 个
开始循环清理嵌套的冗余的超链接关键词辅助标记：
已完成第4 个
开始添加超链接，
开始清理辅助标记，
开始写入文本，
Traceback (most recent call last):
File "/Users/vivian/Downloads/mdxFormat_upload_V3.5/mdxFormat_upload_V3.5.py", line 219, in <module>
main()
File "/Users/vivian/Downloads/mdxFormat_upload_V3.5/mdxFormat_upload_V3.5.py", line 213, in main
mdxFormat(path,outputPath)
File "/Users/vivian/Downloads/mdxFormat_upload_V3.5/mdxFormat_upload_V3.5.py", line 198, in mdxFormat
if writeTxt(txt,outputPath):
File "/Users/vivian/Downloads/mdxFormat_upload_V3.5/mdxFormat_upload_V3.5.py", line 48, in writeTxt
os.mkdir(path)
FileNotFoundError: No such file or directory: ''

muyuzhth0 发表于 2021-10-6 22:34:08

漫步云海涧发表于 2021-10-6 20:16
我的版本是 3.9.7, 遇到了问题。请帮助

mdxFormat读取文件的路径是绝对路径。你得在mdxFormat_upload_V3.5.py-中设置下文本文件的路径。有个变量“path”

muyuzhth0 发表于 2021-10-6 22:47:27

漫步云海涧发表于 2021-10-6 20:16
我的版本是 3.9.7, 遇到了问题。请帮助

你把在“mdxFormat_upload_V3.5.py”中修改的代码贴出来看下。

漫步云海涧 发表于 2021-10-7 07:59:27

muyuzhth0 发表于 2021-10-6 22:47
你把在“mdxFormat_upload_V3.5.py”中修改的代码贴出来看下。

好的！！麻烦了

import os
import glob
import re
import time

#遍历所有子文件
def getFileListAll(filePath):
filelist=[]
for root, dirnames, filenames in os.walk(filePath):
 for filename in filenames:
 filelist.append(os.path.join(root,filename))
 #print(os.path.join(root,filename))
return filelist

#判断是否为txt文档
def isTxts(nameList):
pattern=r"^[^~$].+\.(txt)$"
patternObj=re.compile(pattern,re.I)
notTxtList=[]
for fileName in nameList:
 if not patternObj.fullmatch(fileName): #判断是否为txt文件
 notTxtList.append(nameList)
if notTxtList:
 print("存在非txt文件：{0:},请处理后再运行程序！".format(notTxtList))
 return False
else:
 return True

#在指定的txt文档中查找关键字
def readTxt(fileName):#strkey:查找的关键字；fileName：文件路径
pattern=r"^[^~$].+\.(txt)$"
patternObj=re.compile(pattern,re.I)
if patternObj.fullmatch(fileName): #判断是否为txt文件
 try:
 f=open(fileName,"r",encoding="utf-8")
 txt=f.read()
 f.close()
 except:
 print("读取文档失败:{0:}".format(fileName))
 return ""
 else: #无异常时，执行
 return txt
 #finally: #无论是否异常，都执行

def writeTxt(txt,outPath):
path="\\".join(outPath.split("\\"))
if not os.path.exists(path):
 os.mkdir(path)
try:
 f=open(outPath,"a",encoding="utf-8")
 f.write(txt)
 f.close()
 return True
except:
 print("写入文档失败:{0:}".format(outPath))
 return False
 #else: #无异常时，执行
 # return txt
#finally: #无论是否异常，都执行

def getfileName(fileList):
'''
:param fileList:文件路径列表
:return nameList:文件名称列表有扩展名
'''
nameList=[]
for fileName in fileList:
 name=fileName.split("\\")[-1]#提取文件名
 nameList.append(name)
return nameList

#显示重复的文件名，如果有重复文件显示重复的文件名称，并返回False，否则返回True
def showDupFile(nameList):
'''
判断是否有文件名相同的文件
:param fileList:文件名列表，包含绝对路径
:return: 如果有重复文件显示重复的文件名称，并返回False，否则返回True
'''
if nameList: #如果文件名列表不为空
 nameSet=set(nameList)
 DupNameList=[]
 for item in nameSet:
 if nameList.count(item)>1:
 DupNameList.append(item)
 if DupNameList:#保存重复文件名的列表
 for L in DupNameList:
 print("{0:}为重复的文件，请处理！".format(L))
 return False
 else:
 print("没有发现文件名相同的文件，处理中...")
 return True

def nameListSort(nameList,rev=True):
'''
按照字符串长度排序
:param fileList:文件名列表
:param reverse:默认降序
:return newNameList:返回排序后的文件名新列表
'''
newNameList = sorted(nameList,key = lambda i:len(i),reverse=rev) #按照字符串长度排序，降序
return newNameList

#添加超链接
def formatTxtHref(nameList,txt):
'''
func：在文本中为特定字符串添加超链接
:param hrefStr:待添加超链接的字符串
:param txt:文本字符串
:return txt:格式化的文本字符串
'''
print("\n开始循环添加超链接关键词辅助标记：")
count=0
nameListDescend=nameListSort(nameList)
for nameD in nameListDescend:
 nameDSimple=nameD.replace(".txt","")
 nameDSMarked="【@"+nameDSimple+"@】"
 txt=txt.replace(nameDSimple,nameDSMarked)
 count=count+1
 print("\r已完成第{0: ^6}个".format(count),end="")

print("\n开始循环清理嵌套的冗余的超链接关键词辅助标记：")
count=0
nameListAscend=nameListSort(nameList,rev=False)
for nameA in nameListAscend:
 nameA=nameA.replace(".txt","")
 pattern=r"【@([^@】]*?)【@{0:}@】".format(nameA)
 patternObjTxt=re.compile(pattern)
 toHrefStr="【@"+"\\1"+"muyubug"+nameA
 txt=patternObjTxt.sub(toHrefStr,txt,count=0) #替换为的字符串

 pattern=r"【@{0:}@】([^【@]*?)@】".format(nameA)
 patternObjTxt=re.compile(pattern)
 toHrefStr=nameA+"\\1"+"muyubug"+"@】"
 txt=patternObjTxt.sub(toHrefStr,txt,count=0) #替换为的字符串
 txt=txt.replace("muyubug","")
 count=count+1
 print("\r已完成第{0: ^6}个".format(count),end="")

print("\n开始添加超链接，")
pattern=r"【@([^【@]+?)@】"
patternObjTxt=re.compile(pattern)
toHrefStr=r"<a href='entry://\1'>\1</a>"
txt=patternObjTxt.sub(toHrefStr,txt,count=0) #替换为的字符串

print("开始清理辅助标记，")
txt=txt.replace("<@>","")
return txt

def formatText(fileName):
'''
func：清洗字符串，格式化
:param fileName:待处理文件
return txt:处理后的字符串
'''
#提取文件名
name=fileName.split("\\")[-1].replace(".txt","")
#在文件名的每个字符键添加标记符
pattern = re.compile('.{1,1}')
matchCharList=pattern.findall(name)
nameMaked="<@>".join(matchCharList)
#生成标题部分
txtTitle=nameMaked+"\n"
#读入文件
txtContent=readTxt(fileName)
#替换正文中的特殊字符
strList=["\n"," "]
for ch in strList:
 txtContent=txtContent.replace(ch," ")
#合成整个词条的内容
txt=txtTitle+txtContent+" \n</>"
return txt

def mdxFormat(path,outputPath):
'''
func:格式化文本
:param path:待格式化文本的路径
:param outputPath:输出的路径和文件名
'''
fileList=getFileListAll(path)#获取指定目录下的全部文件，包括子目录中的文件
nameList=getfileName(fileList)
if isTxts(nameList):
 if showDupFile(nameList):#如果没有重复文件
 print("找到了{0:}个txt文档，".format(len(nameList)))

 print("开始格式化、合并文档：")
 count=0#txt文档数量统计
 txtList=[]#存储文件内容
 for fileName in fileList:
 txt=formatText(fileName) #转换换行和 为 
 txtList.append(txt)
 count+=1
 print("\r已完成第{0: ^6}个".format(count),end="")
 txt="\n".join(txtList)

 #添加超链接
 txt=formatTxtHref(nameList,txt)
 print("开始写入文本，")
 if writeTxt(txt,outputPath):
 print("文件合并输出成功！")
 else:
 print("Error:Merge!")
 return

def main():
timeStart = time.time()

path=r"/Users/vivian/Downloads/mdxFormat_upload_V3.5/test"

outputPath=r"/Users/vivian/Downloads/mdxFormat_upload_V3.5/output/demo.txt"

mdxFormat(path,outputPath)

timeEnd = time.time()
print("程序运行了%d秒"%(timeEnd-timeStart))

if __name__ == '__main__':
main()

muyuzhth0 发表于 2021-10-7 19:54:52

漫步云海涧发表于 2021-10-7 07:59
好的！！麻烦了

你修改了哪条?

muyuzhth0 发表于 2021-10-7 21:48:13

漫步云海涧发表于 2021-10-7 07:59
好的！！麻烦了

path=r"/Users/vivian/Downloads/mdxFormat_upload_V3.5/test"
outputPath=r"/Users/vivian/Downloads/mdxFormat_upload_V3.5/output/demo.txt"
这两条有问题吧，应该是下面这个格式
r"D:\U盘同步\百科\00txt词条汇总"

漫步云海涧 发表于 2021-10-8 07:10:18

muyuzhth0 发表于 2021-10-7 21:48
path=r"/Users/vivian/Downloads/mdxFormat_upload_V3.5/test"
outputPath=r"/Users/vivian/Downloads/md ...

我用的是mac电脑， path 是放的存放txt词条的目录， outputpath 是输出的txt文件

lixiaoshun 发表于 2022-2-18 10:09:24

弄个windows操作界面该多好

muyuzhth0 发表于 2022-2-18 19:10:21

lixiaoshun 发表于 2022-2-18 10:09
弄个windows操作界面该多好

sorry！算法还不完善，需要的人也不多，空闲时间比较少....暂时没有写GUI的打算

页: [1]

掌上百科 - PDAWIKI's Archiver

txt文本格式化工具mdxFormat（python3.9.5编写）