用 Python 取得特定時間的檔案及檔案日期

2/9/2023 Python

前篇〈用 Python 得出資料夾項下的各子資料夾〉介紹了如何用 os.walk 爬取指定路徑下的所有資料夾 & 檔案,並指定要爬取幾層資料夾

這篇繼續延伸,若是檔案為 office 格式 (.docx, .xlsx, .pptx) 的話,如何得到這些檔案的最後修改時間 & 修改人,並由此做判斷,將超過特定時間的檔案找出來

# 取得特定時間的檔案

第一段加亮處 (32-35 行) 設定要抓取哪個特定時間的檔案
第二段加亮處 (37-57 行) 設定要抓取特定格式的檔案
































 
 
 
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import pathlib
from datetime import datetime
import pandas as pd
import zipfile
import xml.dom.minidom

os.system("cls") # clear the console ("cls" is the Windows shell command)
desktopPath = str(pathlib.Path.home() / 'Desktop') # Desktop folder inside the current user's home directory

inputPath = input('Enter parse path: ') # root folder to scan, typed by the user
src = inputPath

inputDays = input('Enter gap days: ') # age threshold in days, typed by the user
inputDaysInt = int(inputDays) # raises ValueError if the input is not an integer

today = datetime.now() # reference time, captured once before the scan starts

fileList = [] # file names, one entry per reported file
filePathList = [] # full paths, parallel to fileList
ModifiedByList = [] # last person who saved each file, parallel to fileList
ModifiedOnList = [] # modification date of each file, parallel to fileList
count = 0 # incremented for every file that passes both filters below

# Walk every folder below src and inspect each file in it.
# folderPath (full path of the current folder), folderNameList (names of its sub-folders, as a list), fileName (names of its files, as a list)
for folderPath, folderNameList, fileName in os.walk(src):
    # i = name of one file in the current folder
    for i in fileName:
        filePath = folderPath + '\\' + i # path is joined with a literal backslash (Windows separator)
        fileTime = os.path.getmtime(filePath) # file-system modification time as a timestamp
        fileTimeDatetime = datetime.fromtimestamp(fileTime) # same moment as a local datetime
        diffDays = (today - fileTimeDatetime).days # whole days between the reference time and the last modification
        if (diffDays > inputDaysInt): # keep only files older than the requested number of days
            if '\~$' not in filePath: # skip paths containing a backslash followed by '~$' (some hidden files have names starting with ~$); NOTE: '\~' is not a valid escape, so the backslash stays in the string
                count = count + 1 # one more file passed both filters
                split_tup = os.path.splitext(i) # 0 = name without extension, 1 = extension including the dot
                file_extension = split_tup[1]
                if file_extension in ('.docx', '.xlsx', '.pptx'): # case-sensitive match on the three handled Office extensions
                    document = zipfile.ZipFile(filePath) # these formats are opened as zip archives; NOTE: the archive is not closed explicitly here

                    # Read docProps/core.xml from the archive (expected to hold the last user and modified date)
                    # and pretty-print it so that each element lands on its own, two-space-indented line.
                    uglyXML = xml.dom.minidom.parseString(document.read('docProps/core.xml')).toprettyxml(indent='  ')
                    asText = uglyXML.splitlines() # one list entry per pretty-printed line

                    # Scan the lines for the two values of interest; the slices below use fixed character offsets.
                    for item in asText:
                        if 'lastModifiedBy' in item:
                            itemLength = len(item)-20 # assumes the line ends with the 20-character tag '</cp:lastModifiedBy>'
                            fileOwner = str(item[21:itemLength]) # assumes 2 spaces of indent + the 19-character opening tag precede the name

                        if 'dcterms:modified' in item:
                            itemLength = len(item)-29 # assumes the 19-character closing tag plus a 10-character time part such as 'T01:23:45Z'
                            fileModified = item[46:itemLength] # assumes a 46-character indent + opening tag; what remains is presumably the 'YYYY-MM-DD' date
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55

這樣即可取得特定時間 & 特定格式的檔案

但因非微軟 office 的檔案無法透過這個方式得到檔案的最後修改人 & 最後修改日期
故須用 else 來另外將非微軟 office 的檔案設值

最後再將這些 list 轉成 pandas dataframe 格式後,儲存成 Excel

# Non-office extension: this else belongs to the Office-extension check of the previous snippet
                else:
                    fileOwner = '' # could not get the file owner if files are not MS office files
                    fileModified= fileTimeDatetime # use the file-system modification time instead
                # append one entry to each of the four parallel lists
                fileList.append(i)
                filePathList.append(filePath)
                ModifiedByList.append(fileOwner)
                ModifiedOnList.append(fileModified)

                # Convert the lists to a dataframe (rebuilt from the full lists each time this line runs)
                data = {'File/Folder Name': fileList, 'Path': filePathList, 'File Modified by' : ModifiedByList, 'File Modified On' : ModifiedOnList}
                df = pd.DataFrame(data)


# Write the collected rows to data.xlsx on the desktop.
# The path is built once so the message always names the file that was really written
# (the message used to say FileData.xlsx while the workbook was saved as data.xlsx).
outputPath = desktopPath  + r'\data.xlsx'
with pd.ExcelWriter(outputPath) as writer:
    df.to_excel(writer, sheet_name = "Data Frame")
# Report only after the with-block has closed the writer, i.e. once the workbook is on disk.
print("Saved to Excel at ", outputPath)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

# Source Code

import os
import pathlib
from datetime import datetime
import pandas as pd
import zipfile
import xml.dom.minidom

from xml.parsers.expat import ExpatError

# Extensions of the zip-based MS Office formats whose docProps/core.xml is read.
OFFICE_EXTENSIONS = ('.docx', '.xlsx', '.pptx')
# Column headers of the generated Excel sheet, in the order of each result row.
COLUMNS = ['File/Folder Name', 'Path', 'File Modified by', 'File Modified On']
# Name of the workbook written to the user's Desktop folder.
OUTPUT_FILE_NAME = 'data.xlsx'


def _first_element_text(dom, tag_name):
    """Return the text of the first ``tag_name`` element in ``dom``, or '' if absent."""
    elements = dom.getElementsByTagName(tag_name)
    if not elements:
        return ''
    return ''.join(
        node.data for node in elements[0].childNodes
        if node.nodeType == node.TEXT_NODE
    ).strip()


def read_office_properties(file_path):
    """Read the last editor and last-modified date stored inside an Office file.

    Args:
        file_path: Path of a .docx / .xlsx / .pptx file (a zip archive).

    Returns:
        A ``(last_modified_by, modified_date)`` tuple of strings, where
        ``modified_date`` is the part of the stored timestamp before ``'T'``
        (``'YYYY-MM-DD'``) and either value is ``''`` when the tag is missing.
        ``None`` when the file is not a readable zip archive, has no
        ``docProps/core.xml``, or that XML cannot be parsed.
    """
    try:
        # The context manager closes the archive even if reading fails.
        with zipfile.ZipFile(file_path) as document:
            core_xml = document.read('docProps/core.xml')
        dom = xml.dom.minidom.parseString(core_xml)
    except (zipfile.BadZipFile, KeyError, ExpatError):
        # Not a zip archive / no core.xml in the archive / malformed XML.
        return None

    # Query the elements directly instead of slicing pretty-printed text at
    # fixed offsets, so the result does not depend on indentation or tag length.
    last_modified_by = _first_element_text(dom, 'cp:lastModifiedBy')
    modified = _first_element_text(dom, 'dcterms:modified')
    return last_modified_by, modified.split('T')[0]


def collect_old_files(src, gap_days, now=None):
    """Find files below ``src`` last modified more than ``gap_days`` days ago.

    Args:
        src: Root folder to walk recursively.
        gap_days: Age threshold; a file is reported when the whole number of
            days since its file-system modification time exceeds this value.
        now: Reference ``datetime``; defaults to the current local time.

    Returns:
        A list of ``(file_name, file_path, modified_by, modified_on)`` tuples.
        For Office files the last two come from the document properties
        (``modified_on`` is a ``'YYYY-MM-DD'`` string); for every other file,
        or when the properties are unavailable, ``modified_by`` is ``''`` and
        ``modified_on`` is the file-system modification ``datetime``.
    """
    if now is None:
        now = datetime.now()

    rows = []
    for folder_path, _folder_names, file_names in os.walk(src):
        for name in file_names:
            # Office creates temporary lock files named '~$<document>'; skip them.
            if name.startswith('~$'):
                continue

            file_path = os.path.join(folder_path, name)
            file_time = datetime.fromtimestamp(os.path.getmtime(file_path))
            if (now - file_time).days <= gap_days:
                continue

            # Start from the file-system values so data from a previously
            # visited file can never leak into this row.
            modified_by, modified_on = '', file_time
            if os.path.splitext(name)[1].lower() in OFFICE_EXTENSIONS:
                properties = read_office_properties(file_path)
                if properties is not None:
                    modified_by = properties[0]
                    modified_on = properties[1] or file_time

            rows.append((name, file_path, modified_by, modified_on))
    return rows


def main():
    """Ask for a folder and an age in days, then export the matching files to Excel."""
    # 'cls' only exists on Windows; other platforms use 'clear'.
    os.system('cls' if os.name == 'nt' else 'clear')

    src = input('Enter parse path: ')
    gap_days = int(input('Enter gap days: '))

    rows = collect_old_files(src, gap_days)

    # Built once after the scan; explicit columns keep the sheet well-formed
    # even when no file matched.
    df = pd.DataFrame(rows, columns=COLUMNS)

    output_path = str(pathlib.Path.home() / 'Desktop' / OUTPUT_FILE_NAME)
    with pd.ExcelWriter(output_path) as writer:
        df.to_excel(writer, sheet_name="Data Frame")
    # Printed after the writer is closed, and with the path that was really written.
    print("Saved to Excel at ", output_path)


if __name__ == "__main__":
    main()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
Last Updated: 5/21/2023, 3:35:52 AM

歡迎點擊追蹤:

(adsbygoogle = window.adsbygoogle || []).push({});