Разбираем FB2 на главы | Snark68. Переводим с LLM

Разбираем файл fb2 на текстовые файлы с главами для загрузки на rulate.

запускать:

python split_fb2.py  file_to_split.fb2

import re
import sys
import os
from pathlib import Path



def replace_chapter_number(chapter_filename, new_chapter_number):
    """
    Replace the first 3- or 4-digit number in a filename with a new chapter number.

    The replacement is zero-padded to the width of the number it replaces
    (e.g. ``0123`` -> ``0005``). If the new number has more digits than the
    original, it is inserted in full rather than truncated, so no significant
    digits are lost. Only the first matching digit run is touched; later
    numbers in the name (a year, a part number) are left alone.

    Note: a run of five or more digits is matched on its first four digits,
    because the pattern accepts at most four.

    Args:
        chapter_filename: The filename string containing a 3 or 4-digit chapter number.
        new_chapter_number: The new chapter number (integer or string).
    Returns:
        The filename string with the chapter number replaced, or the filename
        unchanged when it contains no 3-4 digit run.
    """
    new_number = str(new_chapter_number)

    def replace_match(match):
        # Pad to the original width; zfill never shortens, so longer numbers survive intact.
        return new_number.zfill(len(match.group(0)))

    # count=1: only the first digit run is the chapter number.
    return re.sub(r'\d{3,4}', replace_match, chapter_filename, count=1)



import os
import zipfile
from bs4 import BeautifulSoup


def _read_fb2_bytes(fb2_filepath):
    """Return the raw bytes of an FB2 document, unpacking a ``.zip`` wrapper if present."""
    if not str(fb2_filepath).lower().endswith(".zip"):
        # Read raw bytes so that decoding is left to the XML parser.
        with open(fb2_filepath, 'rb') as f:
            return f.read()

    # Zipped FB2 (like .fb2.zip): use the first archive member ending in .fb2.
    with zipfile.ZipFile(fb2_filepath, 'r') as zip_ref:
        fb2_name = next(
            (name for name in zip_ref.namelist() if name.lower().endswith(".fb2")),
            None,
        )
        if fb2_name is None:
            raise ValueError(f"No .fb2 file found inside archive '{fb2_filepath}'")
        with zip_ref.open(fb2_name) as f:
            return f.read()


def _chapter_title(section):
    """Return the stripped title text of a <section>, or a default when it has no <title>."""
    title_tag = section.find('title')
    if title_tag is None:
        return "Untitled Chapter"
    if title_tag.p is not None:
        # Prefer the first <p> inside <title>.
        return title_tag.p.text.strip()
    return title_tag.text.strip()


def split_fb2_by_chapters(fb2_filepath, output_dir):
    """
    Splits an FB2 file into multiple text files, each containing a chapter.

    Every <section> element becomes one ``<title>.txt`` file in ``output_dir``.
    The file holds the section's paragraphs, each wrapped in ``<p>...</p>`` on
    its own line; paragraphs whose text starts with "Глава" are skipped.

    File names are the chapter title reduced to letters, digits, spaces,
    ``_`` and ``-``, limited to 200 characters. A title with nothing usable
    left falls back to "Untitled Chapter". When two chapters produce the same
    name (compared case-insensitively), later ones get a " (2)", " (3)", ...
    suffix instead of overwriting the earlier file.

    Any error is caught, printed as "An error occurred: ..." and not re-raised.

    NOTE: ``find_all`` searches recursively, so nested sections are visited too
    and a parent section's file also contains its subsections' paragraphs.

    Args:
        fb2_filepath: Path to the FB2 file (plain, or a .zip containing one).
        output_dir: Directory to save the chapter files; created if missing.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)

        soup = BeautifulSoup(_read_fb2_bytes(fb2_filepath), 'lxml-xml')

        used_names = set()  # casefolded names already written during this run
        for section in soup.find_all('section'):
            chapter_title = _chapter_title(section)

            # Sanitize filename (keep only filename-safe characters) and limit its length.
            base_name = "".join(
                ch for ch in chapter_title if ch.isalnum() or ch in " _-"
            )[:200]
            if not base_name.strip():
                # Avoid creating a file literally named ".txt".
                base_name = "Untitled Chapter"

            # Disambiguate repeated titles so chapters never overwrite each other.
            chapter_name = base_name
            suffix = 2
            while chapter_name.casefold() in used_names:
                chapter_name = f"{base_name} ({suffix})"
                suffix += 1
            used_names.add(chapter_name.casefold())

            chapter_filepath = os.path.join(output_dir, chapter_name + ".txt")

            # Drop the chapter heading lines ("Глава ..."), keep everything else.
            paragraphs = (p.text.strip() for p in section.find_all('p'))
            chapter_content = "".join(
                f"<p>{text}</p>\n"
                for text in paragraphs
                if not text.startswith("Глава")
            )

            with open(chapter_filepath, 'w', encoding='utf-8') as outfile:
                outfile.write(chapter_content)

    except Exception as e:
        print(f"An error occurred: {e}")





def main():
    """
    Command-line entry point: split the FB2 file named by the first argument.

    Prints usage and exits with status 1 when no argument is given. Otherwise
    the chapters are written to a directory named after the input file's stem
    plus "_output_chapters". A missing input file or any other exception is
    reported on stdout and ends the process with status 1.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python script.py <input_fb2_file>")
        print("Example: python script.py paint-vol-1-final-tl.fb2")
        sys.exit(1)

    source_path = cli_args[0]

    # Output directory is derived from the input file name (without extension).
    target_dir = Path(source_path).stem + "_output_chapters"

    try:
        if os.path.isfile(source_path):
            # Make sure the destination exists, then do the actual split.
            Path(target_dir).mkdir(parents=True, exist_ok=True)
            split_fb2_by_chapters(source_path, target_dir)
            print(f"FB2 file '{source_path}' split into chapters in '{target_dir}'")
        else:
            raise FileNotFoundError(f"Input file '{source_path}' not found")
    except FileNotFoundError as missing:
        print(f"Error: {missing}")
        sys.exit(1)
    except Exception as failure:
        print(f"An unexpected error occurred: {failure}")
        sys.exit(1)


# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()