import youtube_dl, time, os, csv, random, subprocess, shutil
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from moviepy.editor import VideoFileClip
from datetime import datetime, timedelta
import pandas as pd

def download(url, name):
    """
    Downloads a video from url and saves it in the ./temp/ folder
    as name.mkv
    """
    ydl_opts = {'outtmpl': "./temp/" + name + ".mkv"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
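
# A hedged usage sketch (the URL below is a placeholder, not a real clip):
#
#   download("https://www.twitch.tv/somechannel/clip/SomeClipSlug", "0")
#   # -> saves the clip to ./temp/0.mkv; download_clips() below creates the
#   #    ./temp folder before calling this.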

def today_or_yesterday(date):
    """
    Determines whether a given date string is from today or yesterday
    Returns True if it is from today or yesterday
    Returns False otherwise
    """
    # Date today
    now = datetime.now()
    now_str = now.strftime("%b %d, %Y")
    # Date yesterday
    yesterday = datetime.now() - timedelta(days=1)
    yesterday_str = yesterday.strftime("%b %d, %Y")
    if date == now_str or date == yesterday_str:
        return True
    return False
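
# For reference, the comparison above expects Twitch's "%b %d, %Y" preview-title
# format; a hedged example (the date is an arbitrary placeholder, not real data):
#
#   today_or_yesterday("Jan 01, 2021")
#   # -> True only when run on Jan 01 or Jan 02, 2021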

def get_length_video(video):
    """
    Gets the length of video in seconds
    """
    clip = VideoFileClip(video)
    return clip.duration

def build_ffmpeg_input_str(list_of_clip_names):
    """
    Create the string of inputs for use in ffmpeg
    """
    final_str = ""
    for clip in list_of_clip_names:
        final_str += "-i {} ".format(clip)
    return final_str
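
# A quick illustration of the input string this builds (the clip paths are
# hypothetical placeholders for files in ./temp/):
#
#   build_ffmpeg_input_str(["./temp/0.mkv", "./temp/1.mkv"])
#   # -> "-i ./temp/0.mkv -i ./temp/1.mkv "
#
# Note the trailing space, which process_clips() relies on when it splices this
# string directly in front of "-filter_complex".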

def convert_to_1080p(video_path):
    """
    Upscales given video to 1080p using ffmpeg
    """
    upscaled_path = str(video_path)[:len(str(video_path)) - 4] + "-upscaled.mkv"
    os.system("ffmpeg -i {} -vf scale=1920:-2 -c:v libx264 -preset slow -crf 22 -c:a copy {}".format(str(video_path), upscaled_path))

def scrape_clips():
    """
    Scrapes URLs of top viewed clips in past 24 hours with > 50 viewers
    from specified twitch channels
    Creates clips_data.csv with streamer info, clip URL, and viewer count
    """
    PREV_BROADCASTS = "/videos?filter=archives&sort=time"
    TOP_CLIPS_24_HR = "/clips?filter=clips&range=24hr"
    # Load in all channel URLs from sources.txt and process into a list
    file = open("./sources.txt", "r")
    list_of_channels = list(map(lambda x: x.strip("\n"), file.readlines()))
    file.close()
    # Delete any existing clips_data.csv
    try:
        os.remove("./clips_data.csv")
    except OSError:
        pass
    # Begin the process of scraping clip URLs and writing to clips_data.csv
    with open("./clips_data.csv", "w", newline="") as file:
        writer = csv.writer(file)
        # Write header of clips_data.csv
        writer.writerow(["Channel", "Clip Link", "Views"])
        # Load up Google Chrome browser for scraping
        driver = webdriver.Chrome("./chromedriver.exe")
        # Begin scraping channels
        for channel in list_of_channels:
            driver.get(channel + PREV_BROADCASTS)
            try:
                # Wait until the most recent stream's preview is loaded and scrape its title for the date of the last stream
                wait = WebDriverWait(driver, 10)
                prev_stream_element = wait.until(expected_conditions.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/div[2]/div/main/div[2]/div[3]/div/div/div/div[1]/div[2]/div/div[3]/div/div/div/div/div[2]/div/div[1]/div[1]/article/div[2]/div/div/div[5]/a/div/div/div[1]/div[2]/div/div/div[2]/img")))
                # Skip to the next channel if their last stream was not today or yesterday
                if not today_or_yesterday(prev_stream_element.get_attribute("title")):
                    continue
            except Exception:
                # When someone is currently streaming, their previous streams are not shown,
                # but since they are streaming we know their most recent stream is today,
                # so just carry on with this channel
                pass
            driver.get(channel + TOP_CLIPS_24_HR)
            list_of_clips = list()
            # Wait until the clips section is loaded
            try:
                wait = WebDriverWait(driver, 10)
                preview = wait.until(expected_conditions.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/div[2]/div/main/div[2]/div[3]/div/div/div/div[1]/div[2]/div/div[3]/div/div/div/div/div[2]/div/div/div[1]/div/div/div/div[1]")))
            except Exception:
                continue
            # Scrape every link on the page and keep it if it is a valid clip link not already seen
            links = driver.find_elements_by_xpath("//a[@href]")
            for link in links:
                if "clip/" in link.get_attribute("href") and link.get_attribute("href") not in list_of_clips:
                    list_of_clips.append(link.get_attribute("href"))
            # Scrape the streamer, clip link, and viewer count from each clip
            for clip in list_of_clips:
                driver.get(clip)
                # Wait until the view count is loaded and scrape it
                try:
                    wait = WebDriverWait(driver, 10)
                    view_count = wait.until(expected_conditions.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/div[2]/div/main/div[2]/div[3]/div/div/div[1]/div[1]/div[2]/div/div[1]/div/div[1]/div[2]/div/div[3]/div/div[1]/div[2]")))
                except Exception:
                    # Retry once with a fresh wait in case the element was slow to appear
                    try:
                        wait = WebDriverWait(driver, 10)
                        view_count = wait.until(expected_conditions.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/div[2]/div/main/div[2]/div[3]/div/div/div[1]/div[1]/div[2]/div/div[1]/div/div[1]/div[2]/div/div[3]/div/div[1]/div[2]")))
                    except Exception:
                        continue
                view_count = int(view_count.text.replace(",", ""))
                # Clips are listed from most to least viewed, so stop once views drop below 50
                if view_count < 50:
                    break
                # Write to clips_data.csv
                writer.writerow([channel, clip, view_count])
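
# The CSV written above ends up shaped like this (the row below is purely an
# illustrative placeholder, not scraped data):
#
#   Channel,Clip Link,Views
#   https://www.twitch.tv/somechannel,https://www.twitch.tv/somechannel/clip/SomeClipSlug,1234
#
# download_clips() below sorts on the "Views" column and reads "Clip Link", so
# these header names must stay in sync between the two functions.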

def download_clips(length_final_video=660):
    """
    Reads all of the clips from clips_data.csv and downloads them, from most
    viewed to least, until at least length_final_video seconds of footage is collected
    """
    # Read clips_data.csv into pandas and sort by descending views
    clips_df = pd.read_csv("./clips_data.csv")
    clips_df = clips_df.sort_values(by=["Views"], ascending=False)
    clips_links_sorted = list(clips_df["Clip Link"])
    # Initialize counter for file names
    counter = 0
    # Initialize counter for total length of clips combined
    length_clips = 0
    # Delete any existing temp folder and recreate an empty one
    try:
        shutil.rmtree("./temp")
    except OSError:
        pass
    os.mkdir("./temp")
    # Download each clip, name it chronologically, and check the running length
    for clip in clips_links_sorted:
        try:
            download(clip, str(counter))
        except Exception:
            continue
        length_clips += get_length_video("./temp/" + str(counter) + ".mkv")
        counter += 1
        if length_clips >= length_final_video:
            break
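
# After this runs, ./temp/ is expected to hold the selected clips as 0.mkv,
# 1.mkv, 2.mkv, ... in descending order of view count; process_clips() below
# picks them up by listing that directory.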

def process_clips():
    """
    Converts and merges all clips downloaded using ffmpeg
    """
    # Get all of the video names from the temp directory
    downloaded_clips = os.listdir("./temp")
    # Add path to all files
    downloaded_clips = list(map(lambda file: "./temp/" + file, downloaded_clips))
    # Initialize list for all of the paths of the final clips
    final_clips = list()
    # Check that every video shares the same 1080p resolution
    for clip in downloaded_clips:
        video = VideoFileClip(clip)
        # Convert to 1080p if not already 1920x1080
        if video.h != 1080 or video.w != 1920:
            convert_to_1080p(clip)
            final_clips.append(clip[:len(str(clip)) - 4] + "-upscaled.mkv")
        else:
            final_clips.append(clip)
    random.shuffle(final_clips)
    # Build the "[0:v] [0:a] [1:v] [1:a] ..." stream list for however many clips there are
    stream_list = " ".join("[{0}:v] [{0}:a]".format(i) for i in range(len(final_clips)))
    # Create command for merging clips
    FFMPEG_STR = "ffmpeg {}-filter_complex \"{} concat=n={}:v=1:a=1 [v] [a]\" -map \"[v]\" -map \"[a]\" ./temp/combined.mkv".format(build_ffmpeg_input_str(final_clips), stream_list, len(final_clips))
    # Pass the command through the command prompt
    os.system(FFMPEG_STR)
    # Final command for converting combined.mkv to final.mp4
    os.system("ffmpeg -i ./temp/combined.mkv -codec copy ./final.mp4")

def run():
    """
    Run the process of scraping, downloading, and processing
    """
    scrape_clips()
    download_clips()
    input("Pausing for manual check. Press Enter to continue. ")
    process_clips()


if __name__ == "__main__":
    run()
    # scrape_clips()
    # download_clips()
    # process_clips()