From 393370abf7a1029e3d2526d2a9691a160e0248ab Mon Sep 17 00:00:00 2001
From: Dominic DiTaranto <domdit@gmail.com>
Date: Sun, 28 Jul 2024 15:25:42 -0400
Subject: adding script

---
 chandl.py        | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   7 ++++
 2 files changed, 110 insertions(+)
 create mode 100644 chandl.py
 create mode 100644 requirements.txt

diff --git a/chandl.py b/chandl.py
new file mode 100644
index 0000000..d963378
--- /dev/null
+++ b/chandl.py
@@ -0,0 +1,103 @@
+import argparse
+import os
+from pathlib import Path
+import pickle
+import shutil
+import sys
+
+import requests
+from bs4 import BeautifulSoup
+
+
+class Chandl:
+    """Download files attached to a 4chan thread, logging fetched URLs to skip them next run."""
+
+    def __init__(self, board, thread_id, output_path=None, quiet=False, delete_log=False):
+        """Set up paths and URLs; ``quiet`` sends ``sys.stdout`` to the null device process-wide."""
+        self.board = board
+        self.thread_id = thread_id
+        self.output_path = output_path if output_path else f'data/{self.thread_id}/'
+        self.output_path_exists = os.path.exists(self.output_path)
+        self.quiet = quiet
+        if self.quiet:
+            sys.stdout = open(os.devnull, 'w')
+        self.log_path = f'logs/{thread_id}.pickle'
+        self.delete_log = delete_log
+        if self.delete_log:
+            # missing_ok: there may be no log for this thread yet, which is not an error.
+            Path(self.log_path).unlink(missing_ok=True)
+        self.thread_url = f'https://boards.4chan.org/{board}/thread/{thread_id}'
+        self.cdn_url = f'https://i.4cdn.org/{self.board}' + '/{file_name}'
+        self.downloaded_files = [] if not os.path.exists(self.log_path) else self.deserialize_log_file()
+        self.download_count = 0
+
+    def deserialize_log_file(self):
+        """Load the pickled URL list from the log file, store it on the instance and return it."""
+        print('Reading log file...')
+        with open(self.log_path, 'rb') as f:
+            self.downloaded_files = pickle.load(f)
+        return self.downloaded_files
+
+    def serialize_log_file(self):
+        """Pickle the list of downloaded URLs to the log file, creating its directory if needed."""
+        print('Saving log file...')
+        # pickle.dump returns None, so its result must not be assigned back to the list.
+        Path(self.log_path).parent.mkdir(parents=True, exist_ok=True)
+        with open(self.log_path, 'wb') as f:
+            pickle.dump(self.downloaded_files, f)
+
+    def main(self):
+        """Fetch the thread page, download files not yet logged, then save the log."""
+        print(f'Logs currently contain {len(self.downloaded_files)} files...')
+        resp = self.retrieve_thread_data()
+        try:
+            self.parse_thread_data_for_image_urls(resp.content)
+            print(f'Downloaded {self.download_count} files!')
+            print(f'Logs currently contain {len(self.downloaded_files)} files...')
+        finally:
+            # Save progress even when a download fails, so finished files are not refetched.
+            self.serialize_log_file()
+
+    def retrieve_thread_data(self):
+        """Request the thread's HTML page and return the response object."""
+        print('Retrieving thread data...')
+        return requests.get(self.thread_url)
+
+    def parse_thread_data_for_image_urls(self, thread_data):
+        """Download every file linked from a ``fileText`` div whose CDN URL is not in the log."""
+        print('Parsing thread data...')
+        soup = BeautifulSoup(thread_data, 'html.parser')
+        for file_div in soup.find_all('div', attrs={'class': 'fileText'}):
+            file_path = self.cdn_url.format(file_name=file_div.a["href"].split('/')[-1])
+            file_name = file_div.a.next_element
+            if file_path not in self.downloaded_files:
+                self.save_thread_image(file_path, file_name)
+                self.downloaded_files.append(file_path)
+                self.download_count += 1
+
+    def save_thread_image(self, file_path, file_name):
+        """Stream the file at URL ``file_path`` into the output directory as ``file_name``."""
+        if not self.output_path_exists:
+            # only create output directory if there are files to be saved
+            print('Creating output path...')
+            Path(self.output_path).mkdir(parents=True)
+            self.output_path_exists = True
+        # The name comes from the remote page: drop any directory part before writing to disk.
+        full_file_path = f"{self.output_path}/{os.path.basename(str(file_name))}"
+        print(f'Saving {file_name} to {full_file_path}...')
+        with requests.get(file_path, stream=True) as resp, open(full_file_path, 'wb') as f:
+            shutil.copyfileobj(resp.raw, f)
+
+
+if __name__ == '__main__':
+    # Command-line entry point: parse the arguments and run one download pass.
+    cli = argparse.ArgumentParser()
+    cli.add_argument('board')
+    cli.add_argument('thread_id')
+    cli.add_argument('-o', '--output-path')
+    cli.add_argument('-q', '--quiet', action='store_true')
+    cli.add_argument('-d', '--delete-logs', action='store_true')
+    opts = cli.parse_args()
+
+    # The flag is stored as delete_logs (plural); the constructor parameter is delete_log.
+    Chandl(opts.board, opts.thread_id, opts.output_path, opts.quiet, opts.delete_logs).main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..1c78b9c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+beautifulsoup4==4.12.3
+certifi==2024.7.4
+charset-normalizer==3.3.2
+idna==3.7
+requests==2.32.3
+soupsieve==2.5
+urllib3==2.2.2
-- 
cgit v1.2.3-70-g09d2