# 파이썬으로 대량 문서 작업 소스

올 한해는 문서만 작성하다가 끝났다....
그래도 sap 프로젝트때 파이썬으로 한번 문서 정리 하고 나니까 이번 탈오라클 프로젝트에서는 수월하게 작업했던거 같다.
좀 지저분하긴 한데 다음에 또 쓸일 있으면 정리 되겠지....
프로젝트 하면서 5000개 문서 분류 및 rename, 1000개 쿼리 엑셀로 정리한 파이썬 소스.
해당 소스는 anaconda3 jupyter notebook 에서 실행하였다.

python으로 마이바티스 쿼리 엑셀로 작성
python으로 .doc 파일 .docx 파일로 저장
python으로 .ppt 파일 .pptx로 저장
python으로 ppt 모든 글 읽기
python으로 ppt 테이블의 모든 셀 읽기

# python으로 마이바티스 쿼리 엑셀로 작성

xmltodict.parse(data) 이게 키포인트 였음.

#anaconda prompt> pip install xmltodict
import xmltodict
import json
from openpyxl import Workbook
from pathlib import Path
from openpyxl import load_workbook
wb = load_workbook(filename = '쿼리조사.xlsx')
smtc = wb['커넥션  조사']
#newwb = Workbook()
#newwb = load_workbook(filename = 'smtcServer쿼리조사.xlsx')
#ws1 = newwb.active
#ws1 = newwb['persist-impl']
#ws1.append(['번호','ecdDataSource','impl'])

i=1
for row in smtc.iter_rows(min_col=2, min_row=53 ,max_col=2 ,max_row=83):
    for cell in row:
        print(cell.value, end=" ")
        my_file = Path('C:\\workspace\\smart4c_mip_all\\resource\\spring\\persists\\'+cell.value)
        if my_file.is_file() == False:
            continue
        with open('C:\\workspace\\smart4c_mip_all\\resource\\spring\\persists\\'+cell.value, 'r',encoding='utf-8') as f:
            data = f.read()
        doc = xmltodict.parse(data)
        doc = doc['beans']['bean']
#print(json.dumps(doc, indent=4))
        if len(doc)==2 :
            for bean in doc:
                if '.incrementer.' not in bean['@class']:
#print(cell.value, ' ', bean['@class'])
                    classid = bean['@class']
                    print(i, cell.value, classid.split('.')[-1]+'.java')
                    smtc['C'+str(i)] = classid.split('.')[-1]+'.java'
#ws1.append([i,cell.value,classid.split('.')[-1]+'.java'])
        else :
#print(cell.value, ' ',doc['@class'])
            classid = doc['@class']
            print(i, cell.value, classid.split('.')[-1]+'.java')
            smtc['C'+str(i)] = classid.split('.')[-1]+'.java'
#ws1.append([i,cell.value,classid.split('.')[-1]+'.java'])
        i = i+1
#if 'impl' in bean['@class']:
#print(bean['@class'])
#print(doc)
#doc = doc['@class'].split('.')[-1]+'.java'
#print(doc)
#print(json.dumps(doc, indent=4))

#newwb.save('smtcServer쿼리조사.xlsx')
#newwb.save('smtcServer쿼리조사.xlsx')
#print(smtc['a1'].value)
#smtc['a1'].value = 'ttttttttttttttttttttt\ntttttttt\nttttttt\nttt'

wb.save('쿼리조사.xlsx')

#sheet['A1'] = 1
#sheet.cell(row=2, column=2).value = 2

#book.save('write2cell.xlsx')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59

import re
from pathlib import Path
import glob
from openpyxl import load_workbook
newwb = load_workbook(filename = '쿼리조사.xlsx')
ws1 = newwb['커넥션  조사']
ws2 = newwb.create_sheet("상품이외2")
ws2.append(['번호','ecdDataSource','impl','queryid'])

i=1
for row in ws1.iter_rows(min_col=1, min_row=53 ,max_col=4 ,max_row=53):
    print(row[0].value, row[1].value, row[2].value)

    my_file = glob.glob('C:\\workspace\\smart4c_mip_all\\src\\smart4c\\modules\\**/*'+row[2].value, recursive=True)
    my_file = Path(my_file[0])
    with open(my_file, 'r',encoding='utf-8') as f:
        data = f.read()
    all = re.findall('getSqlFromXml\(\"(.*)\"', data)
#print(cell.value, all)
    for sqlid in all:
        print(row[0].value, row[1].value, row[2].value, sqlid)
        ws2.append([row[0].value, row[1].value, row[2].value, sqlid])
#with open('C:\\Users\\lkh\\smtc\\smart4c_mip_all\\resource\\spring\\persists\\'+cell.value, 'r',encoding='utf-8') as f:
#data = f.read()
#doc = xmltodict.parse(data)
#newwb.save('쿼리조사.xlsx')
#with open('C:\\Users\\lkh\\smtc\\smart4c_mip_all\\src\\smart4c\\modules\\prd\\dao\\impl\\DlvInfoEntDaoImpl.java', 'r',encoding='utf-8') as f:
#data = f.read()
#print(data)
#all = re.findall(r"getSqlFromXml(\w+[\w\.]*)",data)
#all = re.findall('getSqlFromXml\(\"(.*)\"', data)
#print(all)
newwb.save('쿼리조사.xlsx')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

import re
from pathlib import Path
import glob
import xmltodict

wb = load_workbook(filename = '쿼리조사.xlsx')
ws1 = wb["상품이외2"]
i=2
for row in ws1.iter_rows(min_col=1, min_row=2):
    print(i,row[0].value, row[1].value, row[2].value, row[3].value)
    queryfile = row[3].value.split('.')[0]
#my_file = Path('C:\\Users\\lkh\\smtc\\smart4c_mip_all\\src\\smart4c\\modules\\prd\\dao\\impl\\'+cell.value)
    my_file = Path('C:\\workspace\\smart4c_mip_all\\resource\\spring\\query\\prd' + queryfile + '.xml')
    if my_file.is_file():
        with open(my_file,'r',encoding='utf-8') as f:
            data = f.read()
            doc = xmltodict.parse(data)
            doc = doc['properties']['entry']
#print("lendoc",len(doc))
#print(isinstance(doc, (list, tuple)))
            if isinstance(doc, (list, tuple)):
                for query in doc:
                    if query['@key'] == row[3].value:
                        print(query['#text'])
#query.get('#text')
                        ws1['E'+str(i)] = query.get('#text')
            else :
#print(doc)
                if doc['@key'] == row[3].value:
                        print(doc['#text'])
                        ws1['E'+str(i)] = query.get('#text')
    i+=1
#if my_file.is_file():
#print(i, cell.value,my_file)
#else :
#print(i, cell.value,"false")

#wb.save('smtcServer쿼리조사.xlsx')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

import re
from pathlib import Path
import glob

newwb = load_workbook(filename = '쿼리조사.xlsx')
ws1 = newwb["상품이외2"]
#sharews = newwb["쉐어플렉스테이블"]
i=1
for row in ws1.iter_rows(min_col=1, min_row=2):
#print(i,row[4].value)
    i+=1
#shtables=''
    if row[5].value==None:
        continue
    if 'SELECT ' in row[5].value:
        ws1['H'+str(i)] = 'Y'
    if 'INSERT ' in row[5].value:
        ws1['I'+str(i)] = 'Y'
    if 'UPDATE ' in row[5].value:
        ws1['J'+str(i)] = 'Y'
    if 'DELETE ' in row[5].value:
        ws1['K'+str(i)] = 'Y'
    if '{0}' in row[5].value:
        ws1['L'+str(i)] = 'Y'
    if '/*+' in row[5].value:
        ws1['M'+str(i)] = 'Y'
    if '@' in row[5].value:
        ws1['N'+str(i)] = 'Y'
    if 'TB_BB002' in row[5].value:
        ws1['N'+str(i)] = 'TB_BB002'
#for sharetable in sharews.iter_rows(min_col=1, min_row=2):
#if sharetable[0].value in row[4].value:
#shtables+=sharetable[0].value + ', '
#ws1['M'+str(i)] = shtables
newwb.save('쿼리조사.xlsx')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

# python으로 .doc 파일 .docx 파일로 저장

from glob import glob
import re
import os
import win32com.client as win32
from win32com.client import constants

# Create list of paths to .doc files
paths = glob('C:\\Users\\lkh\\Documents\\01.편의점가맹지원\\프로그램설계서_doc\\*.doc', recursive=True)

def save_as_docx(path):
    # Opening MS Word
    word = win32.gencache.EnsureDispatch('Word.Application')
    doc = word.Documents.Open(path)
    doc.Activate ()

    # Rename path with .docx
    new_file_abs = os.path.abspath(path)
    new_file_abs = re.sub(r'\.\w+$', '.docx', new_file_abs)

    # Save and Close
    word.ActiveDocument.SaveAs(
        new_file_abs, FileFormat=constants.wdFormatXMLDocument
    )
    doc.Close(False)

count=0
for path in paths:
#     print(path.replace("\\프로그램설계서\\","\\프로그램설계서_doc\\"))
    save_as_docx(path)
#     os.renames(path, path.replace("\\프로그램설계서\\","\\프로그램설계서_doc\\"))
    count+=1

print(count)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

# python으로 .ppt 파일 .pptx로 저장

https://stackoverflow.com/questions/45670024/convert-ppt-file-to-pptx-in-python (opens new window)

python3에서 안된다고 되어 있는데.. 아나콘다 에서는 잘 동작함

문서 5000개 정리하면서 사용했던 파이썬 소스

from glob import glob
import re
import os
import win32com.client

# Create list of paths to .doc files
paths = glob('C:\\Users\\lkh\\Documents\\01.편의점가맹지원\\보고서설계서\\*.ppt', recursive=True)

def save_as_pptx(path):
    PptApp = win32com.client.Dispatch("Powerpoint.Application")
    PptApp.Visible = True
    PPtPresentation = PptApp.Presentations.Open(path)
    PPtPresentation.SaveAs(path+'x', 24)
    PPtPresentation.close()
    PptApp.Quit()

for path in paths:
    print(path.replace("\\보고서설계서\\", "\\보고서설계서_ppt\\"))
    save_as_pptx(path)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

# python으로 ppt 모든 글 읽기

prs = Presentation(file)
for slide in prs.slides:
    for shape in slide.shapes:
        if shape.has_text_frame:
		    print(shape.text_frame.text)
    for paragraph in shape.text_frame.paragraphs:
        print(paragraph.text)

    for idx in range(0,len(shape.text_frame.paragraphs)-1):
        for ridx in range(0,len(shape.text_frame.paragraphs[idx].runs)):
            print(shape.text_frame.paragraphs[idx].runs[ridx].text)

1
2
3
4
5
6
7
8
9
10
11

# python으로 ppt 테이블의 모든 셀 읽기

import os
from pptx import Presentation
import glob
import sys

path = "C:\\Users\\lkh\\Documents\\01.편의점가맹지원\\보고서설계서"
files = glob.glob(path + '/*')

for file in files:
    prs = Presentation(file)
    for slide in prs.slides:
        for shape in slide.shapes:
            if shape.has_table:
                for i in range(0,len(shape.table.rows)-1):
                    for j in range(0,len(shape.table.rows[i].cells)-1):
                        cell = shape.table.rows[i].cells[j].text
                        print(cell)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

#python

← playwright DBeaver(디비버) Public Key Retrieval is not allowed 에러 →