# 파이썬으로 대량 문서 작업 소스
올 한해는 문서만 작성하다가 끝났다....
그래도 sap 프로젝트때 파이썬으로 한번 문서 정리 하고 나니까 이번 탈오라클 프로젝트에서는 수월하게 작업했던거 같다.
좀 지저분하긴 한데 다음에 또 쓸일 있으면 정리 되겠지....
프로젝트 하면서 5000개 문서 분류 및 rename, 1000개 쿼리 엑셀로 정리한 파이썬 소스.
해당 소스는 anaconda3 jupyter notebook 에서 실행하였다.
# python으로 마이바티스 쿼리 엑셀로 작성
xmltodict.parse(data)
이게 키포인트 였음.
#anaconda prompt> pip install xmltodict
import xmltodict
import json
from openpyxl import Workbook
from pathlib import Path
from openpyxl import load_workbook
wb = load_workbook(filename = '쿼리조사.xlsx')
smtc = wb['커넥션 조사']
#newwb = Workbook()
#newwb = load_workbook(filename = 'smtcServer쿼리조사.xlsx')
#ws1 = newwb.active
#ws1 = newwb['persist-impl']
#ws1.append(['번호','ecdDataSource','impl'])
i=1
for row in smtc.iter_rows(min_col=2, min_row=53 ,max_col=2 ,max_row=83):
for cell in row:
print(cell.value, end=" ")
my_file = Path('C:\\workspace\\smart4c_mip_all\\resource\\spring\\persists\\'+cell.value)
if my_file.is_file() == False:
continue
with open('C:\\workspace\\smart4c_mip_all\\resource\\spring\\persists\\'+cell.value, 'r',encoding='utf-8') as f:
data = f.read()
doc = xmltodict.parse(data)
doc = doc['beans']['bean']
#print(json.dumps(doc, indent=4))
if len(doc)==2 :
for bean in doc:
if '.incrementer.' not in bean['@class']:
#print(cell.value, ' ', bean['@class'])
classid = bean['@class']
print(i, cell.value, classid.split('.')[-1]+'.java')
smtc['C'+str(i)] = classid.split('.')[-1]+'.java'
#ws1.append([i,cell.value,classid.split('.')[-1]+'.java'])
else :
#print(cell.value, ' ',doc['@class'])
classid = doc['@class']
print(i, cell.value, classid.split('.')[-1]+'.java')
smtc['C'+str(i)] = classid.split('.')[-1]+'.java'
#ws1.append([i,cell.value,classid.split('.')[-1]+'.java'])
i = i+1
#if 'impl' in bean['@class']:
#print(bean['@class'])
#print(doc)
#doc = doc['@class'].split('.')[-1]+'.java'
#print(doc)
#print(json.dumps(doc, indent=4))
#newwb.save('smtcServer쿼리조사.xlsx')
#newwb.save('smtcServer쿼리조사.xlsx')
#print(smtc['a1'].value)
#smtc['a1'].value = 'ttttttttttttttttttttt\ntttttttt\nttttttt\nttt'
wb.save('쿼리조사.xlsx')
#sheet['A1'] = 1
#sheet.cell(row=2, column=2).value = 2
#book.save('write2cell.xlsx')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import re
from pathlib import Path
import glob
from openpyxl import load_workbook
newwb = load_workbook(filename = '쿼리조사.xlsx')
ws1 = newwb['커넥션 조사']
ws2 = newwb.create_sheet("상품이외2")
ws2.append(['번호','ecdDataSource','impl','queryid'])
i=1
for row in ws1.iter_rows(min_col=1, min_row=53 ,max_col=4 ,max_row=53):
print(row[0].value, row[1].value, row[2].value)
my_file = glob.glob('C:\\workspace\\smart4c_mip_all\\src\\smart4c\\modules\\**/*'+row[2].value, recursive=True)
my_file = Path(my_file[0])
with open(my_file, 'r',encoding='utf-8') as f:
data = f.read()
all = re.findall('getSqlFromXml\(\"(.*)\"', data)
#print(cell.value, all)
for sqlid in all:
print(row[0].value, row[1].value, row[2].value, sqlid)
ws2.append([row[0].value, row[1].value, row[2].value, sqlid])
#with open('C:\\Users\\lkh\\smtc\\smart4c_mip_all\\resource\\spring\\persists\\'+cell.value, 'r',encoding='utf-8') as f:
#data = f.read()
#doc = xmltodict.parse(data)
#newwb.save('쿼리조사.xlsx')
#with open('C:\\Users\\lkh\\smtc\\smart4c_mip_all\\src\\smart4c\\modules\\prd\\dao\\impl\\DlvInfoEntDaoImpl.java', 'r',encoding='utf-8') as f:
#data = f.read()
#print(data)
#all = re.findall(r"getSqlFromXml(\w+[\w\.]*)",data)
#all = re.findall('getSqlFromXml\(\"(.*)\"', data)
#print(all)
newwb.save('쿼리조사.xlsx')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import re
from pathlib import Path
import glob
import xmltodict
wb = load_workbook(filename = '쿼리조사.xlsx')
ws1 = wb["상품이외2"]
i=2
for row in ws1.iter_rows(min_col=1, min_row=2):
print(i,row[0].value, row[1].value, row[2].value, row[3].value)
queryfile = row[3].value.split('.')[0]
#my_file = Path('C:\\Users\\lkh\\smtc\\smart4c_mip_all\\src\\smart4c\\modules\\prd\\dao\\impl\\'+cell.value)
my_file = Path('C:\\workspace\\smart4c_mip_all\\resource\\spring\\query\\prd' + queryfile + '.xml')
if my_file.is_file():
with open(my_file,'r',encoding='utf-8') as f:
data = f.read()
doc = xmltodict.parse(data)
doc = doc['properties']['entry']
#print("lendoc",len(doc))
#print(isinstance(doc, (list, tuple)))
if isinstance(doc, (list, tuple)):
for query in doc:
if query['@key'] == row[3].value:
print(query['#text'])
#query.get('#text')
ws1['E'+str(i)] = query.get('#text')
else :
#print(doc)
if doc['@key'] == row[3].value:
print(doc['#text'])
ws1['E'+str(i)] = query.get('#text')
i+=1
#if my_file.is_file():
#print(i, cell.value,my_file)
#else :
#print(i, cell.value,"false")
#wb.save('smtcServer쿼리조사.xlsx')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
from pathlib import Path
import glob
newwb = load_workbook(filename = '쿼리조사.xlsx')
ws1 = newwb["상품이외2"]
#sharews = newwb["쉐어플렉스테이블"]
i=1
for row in ws1.iter_rows(min_col=1, min_row=2):
#print(i,row[4].value)
i+=1
#shtables=''
if row[5].value==None:
continue
if 'SELECT ' in row[5].value:
ws1['H'+str(i)] = 'Y'
if 'INSERT ' in row[5].value:
ws1['I'+str(i)] = 'Y'
if 'UPDATE ' in row[5].value:
ws1['J'+str(i)] = 'Y'
if 'DELETE ' in row[5].value:
ws1['K'+str(i)] = 'Y'
if '{0}' in row[5].value:
ws1['L'+str(i)] = 'Y'
if '/*+' in row[5].value:
ws1['M'+str(i)] = 'Y'
if '@' in row[5].value:
ws1['N'+str(i)] = 'Y'
if 'TB_BB002' in row[5].value:
ws1['N'+str(i)] = 'TB_BB002'
#for sharetable in sharews.iter_rows(min_col=1, min_row=2):
#if sharetable[0].value in row[4].value:
#shtables+=sharetable[0].value + ', '
#ws1['M'+str(i)] = shtables
newwb.save('쿼리조사.xlsx')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# python으로 .doc 파일 .docx 파일로 저장
from glob import glob
import re
import os
import win32com.client as win32
from win32com.client import constants
# Create list of paths to .doc files
paths = glob('C:\\Users\\lkh\\Documents\\01.편의점가맹지원\\프로그램설계서_doc\\*.doc', recursive=True)
def save_as_docx(path):
# Opening MS Word
word = win32.gencache.EnsureDispatch('Word.Application')
doc = word.Documents.Open(path)
doc.Activate ()
# Rename path with .docx
new_file_abs = os.path.abspath(path)
new_file_abs = re.sub(r'\.\w+$', '.docx', new_file_abs)
# Save and Close
word.ActiveDocument.SaveAs(
new_file_abs, FileFormat=constants.wdFormatXMLDocument
)
doc.Close(False)
count=0
for path in paths:
# print(path.replace("\\프로그램설계서\\","\\프로그램설계서_doc\\"))
save_as_docx(path)
# os.renames(path, path.replace("\\프로그램설계서\\","\\프로그램설계서_doc\\"))
count+=1
print(count)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# python으로 .ppt 파일 .pptx로 저장
https://stackoverflow.com/questions/45670024/convert-ppt-file-to-pptx-in-python (opens new window)
python3에서 안된다고 되어 있는데.. 아나콘다 에서는 잘 동작함
문서 5000개 정리하면서 사용했던 파이썬 소스
from glob import glob
import re
import os
import win32com.client
# Create list of paths to .doc files
paths = glob('C:\\Users\\lkh\\Documents\\01.편의점가맹지원\\보고서설계서\\*.ppt', recursive=True)
def save_as_pptx(path):
PptApp = win32com.client.Dispatch("Powerpoint.Application")
PptApp.Visible = True
PPtPresentation = PptApp.Presentations.Open(path)
PPtPresentation.SaveAs(path+'x', 24)
PPtPresentation.close()
PptApp.Quit()
for path in paths:
print(path.replace("\\보고서설계서\\", "\\보고서설계서_ppt\\"))
save_as_pptx(path)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# python으로 ppt 모든 글 읽기
prs = Presentation(file)
for slide in prs.slides:
for shape in slide.shapes:
if shape.has_text_frame:
print(shape.text_frame.text)
for paragraph in shape.text_frame.paragraphs:
print(paragraph.text)
for idx in range(0,len(shape.text_frame.paragraphs)-1):
for ridx in range(0,len(shape.text_frame.paragraphs[idx].runs)):
print(shape.text_frame.paragraphs[idx].runs[ridx].text)
1
2
3
4
5
6
7
8
9
10
11
2
3
4
5
6
7
8
9
10
11
# python으로 ppt 테이블의 모든 셀 읽기
import os
from pptx import Presentation
import glob
import sys
path = "C:\\Users\\lkh\\Documents\\01.편의점가맹지원\\보고서설계서"
files = glob.glob(path + '/*')
for file in files:
prs = Presentation(file)
for slide in prs.slides:
for shape in slide.shapes:
if shape.has_table:
for i in range(0,len(shape.table.rows)-1):
for j in range(0,len(shape.table.rows[i].cells)-1):
cell = shape.table.rows[i].cells[j].text
print(cell)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17