Spaces:
Sleeping
Sleeping
Upload pdf-modification-script.ipynb
Browse files
pdf-modification-script.ipynb
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.11.11","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":12363270,"sourceType":"datasetVersion","datasetId":7794961}],"dockerImageVersionId":31040,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"%pip install -q pypdf pymupdf","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true,"execution":{"iopub.status.busy":"2025-07-06T06:24:46.548518Z","iopub.execute_input":"2025-07-06T06:24:46.548911Z","iopub.status.idle":"2025-07-06T06:24:52.368756Z","shell.execute_reply.started":"2025-07-06T06:24:46.548880Z","shell.execute_reply":"2025-07-06T06:24:52.367814Z"}},"outputs":[{"name":"stdout","text":"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.1/24.1 MB\u001b[0m \u001b[31m59.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n\u001b[?25h","output_type":"stream"}],"execution_count":7},{"cell_type":"code","source":"from pypdf import PdfReader, PdfWriter\nimport os\n\nbook_path = \"/kaggle/input/chatbot-vdb-docs/Lenny Delligatti - SysML Distilled A Brief Guide to the Systems Modeling Language-Addison-Wesley Professional (2013).pdf\"\nmodified_book_name = \"Lenny_Delligatti_modified.pdf\"\n\noutput_dir = \"/kaggle/working/modified_docs\"\n\n# exist_ok makes this safe to re-run and avoids the check-then-create race.\nos.makedirs(output_dir, exist_ok=True)\n\n\ndef delete_pdf_pages(input_pdf_path, output_pdf_path, pages_to_delete):\n    \"\"\"Write a copy of a PDF that omits the given pages.\n\n    Args:\n        input_pdf_path: Path of the PDF to read.\n        output_pdf_path: Path the trimmed PDF is written to (overwritten if it exists).\n        pages_to_delete: Iterable of 0-based page indices to drop; indices that\n            do not exist in the document are ignored.\n    \"\"\"\n    reader = PdfReader(input_pdf_path)\n    writer = PdfWriter()\n    pages_to_delete_set = set(pages_to_delete)\n\n    # Page numbers in pypdf are 0-indexed\n    for index, page in enumerate(reader.pages):\n        if index not in pages_to_delete_set:\n            writer.add_page(page)\n\n    with open(output_pdf_path, \"wb\") as out_file:\n        writer.write(out_file)\n\n\n# 0-based indices: this drops pages 1-35 and 253-302 (1-indexed).\npages_to_delete = [*range(0, 35), *range(252, 302)]\n\ndelete_pdf_pages(\n    input_pdf_path=book_path,\n    output_pdf_path=f\"{output_dir}/{modified_book_name}\",\n    pages_to_delete=pages_to_delete,\n)\n\nprint(\"Done\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-07-06T06:22:36.731020Z","iopub.execute_input":"2025-07-06T06:22:36.731330Z","iopub.status.idle":"2025-07-06T06:22:37.892104Z","shell.execute_reply.started":"2025-07-06T06:22:36.731286Z","shell.execute_reply":"2025-07-06T06:22:37.891337Z"}},"outputs":[{"name":"stdout","text":"Done\n","output_type":"stream"}],"execution_count":4},{"cell_type":"code","source":"","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-07-06T06:23:13.390700Z","iopub.execute_input":"2025-07-06T06:23:13.391064Z","iopub.status.idle":"2025-07-06T06:23:13.396139Z","shell.execute_reply.started":"2025-07-06T06:23:13.391040Z","shell.execute_reply":"2025-07-06T06:23:13.395116Z"}},"outputs":[],"execution_count":5},{"cell_type":"code","source":"import fitz # PyMuPDF\n\nbook_path = \"/kaggle/input/chatbot-vdb-docs/The_SysML_Modelling_Language.pdf\"\nmodified_book_name = \"The_SysML_Modelling_Language_modified.pdf\"\n\n\ndef redact_area(pdf_path, output_path, page_num, rect_coords):\n    \"\"\"Blank out a rectangle on one page and save the result as a new PDF.\n\n    A redaction annotation is applied instead of drawing a white rectangle:\n    a drawn rectangle only hides the content, which stays in the file and is\n    still returned by text extraction.\n\n    Args:\n        pdf_path: Path of the PDF to read.\n        output_path: Path the modified PDF is written to.\n        page_num: 0-based index of the page to modify.\n        rect_coords: (x0, y0, x1, y1) corners of the area to blank out.\n    \"\"\"\n    # The context manager closes the document even if redacting or saving fails.\n    with fitz.open(pdf_path) as doc:\n        page = doc[page_num]\n        rect = fitz.Rect(rect_coords) # (x0, y0, x1, y1)\n\n        # Mark the area, then apply it; the area is left filled with white.\n        page.add_redact_annot(rect, fill=(1, 1, 1))\n        page.apply_redactions()\n\n        doc.save(output_path)\n\n    print(f\"Saved redacted PDF to {output_path}\")\n\n\n# Uses output_dir, which is created by the first PDF-trimming cell.\nredact_area(\n    pdf_path=book_path,\n    output_path=f\"{output_dir}/{modified_book_name}\",\n    page_num=0, # First page\n    rect_coords=(60, 50, 600, 200) # Approx. bounding box of the title area\n)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-07-06T06:26:46.113175Z","iopub.execute_input":"2025-07-06T06:26:46.113540Z","iopub.status.idle":"2025-07-06T06:26:46.133766Z","shell.execute_reply.started":"2025-07-06T06:26:46.113515Z","shell.execute_reply":"2025-07-06T06:26:46.132147Z"}},"outputs":[{"name":"stdout","text":"Saved redacted PDF to /kaggle/working/modified_docs/The_SysML_Modelling_Language_modified.pdf\n","output_type":"stream"}],"execution_count":10},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"book_path = \"/kaggle/input/chatbot-vdb-docs/OMG Systems Modeling Language (OMG SysML).pdf\"\nmodified_book_name = \"OMG Systems Modeling Language (OMG SysML)_modified.pdf\"\n\n# delete_pdf_pages and output_dir come from the first PDF-trimming cell;\n# redefining them here would silently shadow those definitions.\n\n# 0-based indices: this drops pages 1-30 (1-indexed).\npages_to_delete = list(range(0, 30))\n\ndelete_pdf_pages(\n    input_pdf_path=book_path,\n    output_pdf_path=f\"{output_dir}/{modified_book_name}\",\n    pages_to_delete=pages_to_delete,\n)\n\nprint(\"Done\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-07-06T06:38:27.478416Z","iopub.execute_input":"2025-07-06T06:38:27.478717Z","iopub.status.idle":"2025-07-06T06:38:28.174061Z","shell.execute_reply.started":"2025-07-06T06:38:27.478698Z","shell.execute_reply":"2025-07-06T06:38:28.173092Z"}},"outputs":[{"name":"stdout","text":"Done\n","output_type":"stream"}],"execution_count":12},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null}]}
|