summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--chandl.py103
-rw-r--r--requirements.txt7
2 files changed, 110 insertions, 0 deletions
diff --git a/chandl.py b/chandl.py
new file mode 100644
index 0000000..d963378
--- /dev/null
+++ b/chandl.py
@@ -0,0 +1,103 @@
+import argparse
+import os
+from pathlib import Path
+import pickle
+import shutil
+import sys
+
+import requests
+from bs4 import BeautifulSoup
+
+
class Chandl:
    """Download every image from a 4chan thread.

    Already-fetched CDN URLs are recorded in a pickle log under ``logs/``
    so repeated runs against the same thread only download new images.
    """

    def __init__(self, board, thread_id, output_path=None, quiet=False, delete_log=False):
        """Set up paths, URLs and the downloaded-files log.

        :param board: board short name, e.g. ``'g'``
        :param thread_id: thread identifier (used in URLs and default paths)
        :param output_path: image output directory; defaults to ``data/<thread_id>/``
        :param quiet: when True, silence all progress output
        :param delete_log: when True, discard any existing download log first
        """
        self.board = board
        self.thread_id = thread_id

        self.output_path = output_path if output_path else f'data/{self.thread_id}/'
        self.output_path_exists = os.path.exists(self.output_path)

        self.quiet = quiet
        if self.quiet:
            # Blunt but effective: swallow all print() output for the rest of
            # the process. The devnull handle is intentionally kept open for
            # the process lifetime.
            sys.stdout = open(os.devnull, 'w')

        self.log_path = f'logs/{thread_id}.pickle'

        self.delete_log = delete_log
        if self.delete_log and os.path.exists(self.log_path):
            # Bug fix: the original removed the log unconditionally and raised
            # FileNotFoundError when no log existed yet.
            os.remove(self.log_path)

        self.thread_url = f'https://boards.4chan.org/{board}/thread/{thread_id}'
        self.cdn_url = f'https://i.4cdn.org/{self.board}' + '/{file_name}'

        # CDN URLs downloaded in previous runs; empty list on a fresh thread.
        self.downloaded_files = [] if not os.path.exists(self.log_path) else self.deserialize_log_file()

        self.download_count = 0

    def deserialize_log_file(self):
        """Load and return the list of already-downloaded file URLs."""
        print('Reading log file...')
        # NOTE: pickle is only acceptable here because the log is produced by
        # this script itself; never unpickle untrusted data.
        with open(self.log_path, 'rb') as f:
            self.downloaded_files = pickle.load(f)
        return self.downloaded_files

    def serialize_log_file(self):
        """Persist the downloaded-files list to the pickle log."""
        print('Saving log file...')
        # Bug fix: ensure the logs/ directory exists — the original crashed on
        # first run because nothing ever created it.
        Path(self.log_path).parent.mkdir(parents=True, exist_ok=True)
        with open(self.log_path, 'wb') as f:
            # Bug fix: the original assigned pickle.dump()'s return value
            # (None) back to self.downloaded_files, destroying the list.
            pickle.dump(self.downloaded_files, f)

    def main(self):
        """Fetch the thread page, download any new images, update the log."""
        print(f'Logs currently contain {len(self.downloaded_files)} files...')

        resp = self.retrieve_thread_data()

        self.parse_thread_data_for_image_urls(resp.content)

        print(f'Downloaded {self.download_count} files!')
        print(f'Logs currently contain {len(self.downloaded_files)} files...')

        self.serialize_log_file()

    def retrieve_thread_data(self):
        """GET the thread's HTML page and return the response object.

        :raises requests.HTTPError: on a non-2xx response (robustness fix —
            the original silently parsed error pages as thread HTML).
        """
        print('Retrieving thread data...')
        resp = requests.get(self.thread_url)
        resp.raise_for_status()
        return resp

    def parse_thread_data_for_image_urls(self, thread_data):
        """Scan the thread HTML for image links and download the new ones.

        :param thread_data: raw HTML bytes of the thread page
        """
        print('Parsing thread data...')
        soup = BeautifulSoup(thread_data, 'html.parser')
        for file_div in soup.find_all('div', attrs={'class': 'fileText'}):
            # The anchor href ends in the CDN file name; its text is the
            # original upload name used for saving locally.
            file_path = self.cdn_url.format(file_name=file_div.a["href"].split('/')[-1])
            file_name = file_div.a.next_element

            if file_path not in self.downloaded_files:
                self.save_thread_image(file_path, file_name)
                self.downloaded_files.append(file_path)
                self.download_count += 1

    def save_thread_image(self, file_path, file_name):
        """Stream one image from the CDN to the output directory.

        :param file_path: full CDN URL of the image
        :param file_name: local file name to save under
        """
        # Only create the output directory once there is a file to save.
        if not self.output_path_exists:
            print('Creating output path...')
            Path(self.output_path).mkdir(parents=True)
            self.output_path_exists = True

        resp = requests.get(file_path, stream=True)

        full_file_path = f"{self.output_path}/{file_name}"

        print(f'Saving {file_name} to {full_file_path}...')
        with open(full_file_path, 'wb') as f:
            shutil.copyfileobj(resp.raw, f)
+
+
if __name__ == '__main__':
    # CLI entry point: download all images from one thread.
    parser = argparse.ArgumentParser(
        description='Download all images from a 4chan thread, skipping files '
                    'already recorded in the download log.')
    parser.add_argument('board', type=str,
                        help="board short name, e.g. 'g'")
    parser.add_argument('thread_id', type=str,
                        help='numeric thread id')
    parser.add_argument('-o', '--output-path', dest='output_path', type=str, default=None,
                        help='directory to save images into (default: data/<thread_id>/)')
    parser.add_argument('-q', '--quiet', dest='quiet', action='store_true', default=False,
                        help='suppress all progress output')
    parser.add_argument('-d', '--delete-logs', dest='delete_logs', action='store_true', default=False,
                        help='delete the existing download log before starting')

    args = parser.parse_args()

    c = Chandl(args.board, args.thread_id, args.output_path, args.quiet, args.delete_logs)
    c.main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..1c78b9c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+beautifulsoup4==4.12.3
+certifi==2024.7.4
+charset-normalizer==3.3.2
+idna==3.7
+requests==2.32.3
+soupsieve==2.5
+urllib3==2.2.2