-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount_videos.py
197 lines (167 loc) · 8.27 KB
/
count_videos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import os
import io
import argparse
import pandas as pd
from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from moviepy.editor import VideoFileClip
from concurrent.futures import ThreadPoolExecutor
import threading
class GoogleDriveDownloader:
def __init__(self, args):
self.args = args
self.babyview_drive_id = '0AJtfZGZvxvfxUk9PVA'
self.SCOPES = ['https://www.googleapis.com/auth/drive']
self.total_video_count = 0
self.video_durations = {} # initialize an empty dictionary to keep track of video durations
self.lock = threading.Lock() # lock for thread-safety
def load_existing_video_paths(self):
if os.path.exists(self.args.csv_path):
df = pd.read_csv(self.args.csv_path)
return set(df['File Path'])
return set()
def download_file(self, service, file_id, file_path):
# @TODO: some files with the exact same name will be different, can ask
# manual check to add a different name (-1, -2, -3)
# do not download already existed file..
if os.path.exists(file_path):
return
print(f"Downloading to: {file_path}")
# Uncomment the below for actual download
request = service.files().get_media(fileId=file_id)
fh = io.FileIO(file_path, 'wb')
downloader = MediaIoBaseDownload(fh, request)
done = False
while not done:
status, done = downloader.next_chunk()
print(f"Download {int(status.progress() * 100)}% complete.")
with self.lock:
self.total_video_count += 1
def get_video_duration(self, file_path):
with VideoFileClip(file_path) as clip:
duration = clip.duration
with self.lock:
self.video_durations[file_path] = duration
#print(f"Video duration: {duration} seconds")
def download_and_get_duration(self, service, file_id, file_path, existing_paths):
relative_path = file_path.replace(self.args.video_root, '') # Get the relative path
if relative_path in existing_paths:
print(f"Skipping already existing video: {file_path}")
return
try:
self.download_file(service, file_id, file_path)
except Exception as e:
print(f">>>>>>>>>>>>>>>>>>>>>> {file_path} failed to download..")
print("Exception is", e)
try:
self.get_video_duration(file_path)
except Exception as e:
print("Exception is:", e)
def get_existing_video_durations(self, root_path):
for dirpath, _, filenames in os.walk(root_path):
for file in filenames:
if file.endswith('.MP4'):
file_path = os.path.join(dirpath, file)
try:
self.get_video_duration(file_path)
except Exception as e:
print(f"Error getting duration for {file_path}. Exception: {e}")
try:
os.remove(file_path)
print(f"Deleted {file_path} due to error.")
except Exception as delete_error:
print(f"Error deleting {file_path}. Exception: {delete_error}")
#def recursive_search_and_download(self, service, folder_id, local_path):
def recursive_search_and_download(self, service, folder_id, local_path, existing_paths):
if not os.path.exists(local_path):
os.makedirs(local_path)
page_token = None
while True:
results = service.files().list(
driveId=self.babyview_drive_id,
corpora='drive',
q=f"'{folder_id}' in parents and trashed = false", # exclude trashed items
pageSize=1000,
fields="nextPageToken, files(id, name, mimeType)",
includeItemsFromAllDrives=True,
supportsAllDrives=True,
pageToken=page_token
).execute()
items = results.get('files', [])
with ThreadPoolExecutor(max_workers=self.args.max_workers) as executor:
futures = []
for item in items:
if item['mimeType'] == 'application/vnd.google-apps.folder':
self.recursive_search_and_download(service, item['id'], os.path.join(local_path, item['name']), existing_paths)
elif item['name'].endswith('.MP4'):
future = executor.submit(self.download_and_get_duration, service, item['id'], os.path.join(local_path, item['name']), existing_paths)
futures.append(future)
for future in futures:
future.result() # This will raise any exceptions encountered
page_token = results.get('nextPageToken', None)
if page_token is None:
break
def download_videos_from_drive(self):
creds = None
token_path = os.path.join(self.args.cred_folder, 'token.json')
if os.path.exists(token_path):
creds = Credentials.from_authorized_user_file(token_path, self.SCOPES)
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
cred_path = os.path.join(self.args.cred_folder, 'credentials.json')
flow = InstalledAppFlow.from_client_secrets_file(cred_path, self.SCOPES)
creds = flow.run_local_server(port=self.args.port)
with open(token_path, 'w') as token:
token.write(creds.to_json())
service = build('drive', 'v3', credentials=creds)
existing_paths = self.load_existing_video_paths()
# recursive search and download, skipping videos with paths already in the CSV
self.recursive_search_and_download(service, self.babyview_drive_id, self.args.video_root, existing_paths)
def save_to_csv(self):
csv_path = self.args.csv_path
# remove video_root prefix from file paths
cleaned_paths = [(path.replace(self.args.video_root, ''), duration) for path, duration in self.video_durations.items()]
new_data = pd.DataFrame(cleaned_paths, columns=['File Path', 'Duration (s)'])
# if CSV exists, append new data to it
if os.path.exists(csv_path):
existing_data = pd.read_csv(csv_path)
combined_data = pd.concat([existing_data, new_data], ignore_index=True)
combined_data.drop_duplicates(subset='File Path', keep='last', inplace=True)
combined_data.to_csv(csv_path, index=False)
else:
new_data.to_csv(csv_path, index=False)
def seconds_to_hms(self, seconds):
""" Convert seconds to hh:mm:ss format
"""
hours = seconds // 3600
seconds %= 3600
minutes = seconds // 60
seconds %= 60
return hours, minutes, seconds
def print_video_stats(self):
total_duration = sum(self.video_durations.values())
total_videos = len(self.video_durations)
hours, minutes, secs = self.seconds_to_hms(total_duration)
print(f"Total Number of Videos: {total_videos}")
print(f"Total Duration of Videos: {hours} hours {minutes} mins {secs:.2f} secs")
def main():
video_root = "/data/ziyxiang/BabyView/"
cred_folder = "/ccn2/u/ziyxiang/cloud_credentials/babyview"
parser = argparse.ArgumentParser(description="Download videos from cloud services")
parser.add_argument('--video_root', type=str, default=video_root)
parser.add_argument('--csv_path', type=str, default='video_durations.csv')
parser.add_argument('--cred_folder', type=str, default=cred_folder)
parser.add_argument('--max_workers', type=int, default=1)
args = parser.parse_args()
downloader = GoogleDriveDownloader(args)
#downloader.get_existing_video_durations(args.video_root)
downloader.download_videos_from_drive()
downloader.save_to_csv()
downloader.print_video_stats()
if __name__ == '__main__':
main()