use crate::types::Sizer;
use crate::{Chunk, ChunkError, File, FileChunker, byte_to_line_number, sizers::ByteSizer};

/// `SizeChunker` divides source code into chunks of approximately equal size while attempting to
/// break at reasonable boundaries (line breaks) to avoid splitting tokens or strings.
///
/// The end of each chunk is proposed by a [`Sizer`]; consecutive chunks can optionally repeat
/// some trailing bytes of the previous chunk (see `chunk_overlap`).
pub struct SizeChunker {
    /// Strategy asked for the end offset of each chunk, given the source text and the chunk's
    /// start offset.
    sizer: Box<dyn Sizer>,
    /// Number of bytes from the end of a chunk that the next chunk starts with again. It is
    /// capped at the chunk's length and the resulting start is moved back to a character
    /// boundary; `0` disables overlap.
    chunk_overlap: usize,
}

impl SizeChunker {
    /// Creates a chunker whose chunk ends are chosen by a [`ByteSizer`] built from
    /// `max_chunk_size`.
    ///
    /// `chunk_overlap` is the number of bytes each chunk shares with the chunk before it.
    ///
    /// # Errors
    ///
    /// Returns [`ChunkError::InvalidOptions`] when `chunk_overlap` is not strictly smaller than
    /// `max_chunk_size`.
    pub fn new(max_chunk_size: usize, chunk_overlap: usize) -> Result<Self, ChunkError> {
        if chunk_overlap >= max_chunk_size {
            let message = String::from("chunk overlap must be less than chunk size");
            return Err(ChunkError::InvalidOptions(message));
        }

        let sizer = Box::new(ByteSizer::new(max_chunk_size));
        Ok(Self::new_sizer(sizer, chunk_overlap))
    }

    /// Creates a chunker around a caller-supplied [`Sizer`].
    ///
    /// No validation happens here: `chunk_overlap` is stored exactly as given.
    pub fn new_sizer(sizer: Box<dyn Sizer>, chunk_overlap: usize) -> Self {
        Self {
            sizer,
            chunk_overlap,
        }
    }

    /// Returns the largest UTF-8 character boundary of `s` that is at or below `byte_index`.
    fn find_char_boundary_before(&self, s: &str, byte_index: usize) -> usize {
        // Offset 0 is always a character boundary, so the search cannot come up empty; the
        // fallback only exists to avoid an `unwrap`.
        (0..=byte_index)
            .rev()
            .find(|&candidate| s.is_char_boundary(candidate))
            .unwrap_or(0)
    }
}

impl FileChunker for SizeChunker {
    fn chunk_file<'a>(
        &self,
        target: &mut Vec<Chunk<'a>>,
        file: &File<'a>,
    ) -> Result<(), ChunkError> {
        let mut start_byte = 0;
        let total_length = file.source_code.len();

        while start_byte < total_length {
            let (mut end_byte, more) = self.sizer.find_end_byte(file.source_code, start_byte)?;

            if more
                && let Some(last_newline_post) = &file.source_code[start_byte..end_byte].rfind('\n')
            {
                end_byte = start_byte + last_newline_post + 1;
            }

            target.push(Chunk {
                file_path: file.file_path,
                start_byte,
                end_byte,
                start_line: byte_to_line_number(file.source_code, start_byte),
                content: &file.source_code[start_byte..end_byte],
                language: "",
            });

            if !more {
                break;
            }

            if self.chunk_overlap > 0 {
                let chunk_length = end_byte - start_byte;
                let overlap = self.chunk_overlap.min(chunk_length);
                start_byte = self.find_char_boundary_before(file.source_code, end_byte - overlap);
            } else {
                start_byte = end_byte;
            }
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    const SIZE_CHUNKER_TEST_CONTENT: &str = r#"This is a test content for chunking.
We will split this content into multiple chunks of a certain size.
We will use the simple size chunker.

This is not meant for testing with a parser,
since the parser is specifically for code content.

The simple chunker should split this content given a chunk size.
If the split does not occur at a new line, the simple chunker
will try to split at the last new line index.
"#;

    const SIZE_CHUNKER_TEST_CHUNK1: &str = r#"This is a test content for chunking.
We will split this content into multiple chunks of a certain size.
We will use the simple size chunker.

This is not meant for testing with a parser,
"#;

    const SIZE_CHUNKER_TEST_CHUNK2: &str = r#"since the parser is specifically for code content.

The simple chunker should split this content given a chunk size.
If the split does not occur at a new line, the simple chunker
"#;

    const SIZE_CHUNKER_TEST_CHUNK3: &str = r#"will try to split at the last new line index.
"#;

    /// Builds a chunker via `SizeChunker::new`, failing the test if the options are rejected.
    fn make_chunker(max_chunk_size: usize, chunk_overlap: usize) -> SizeChunker {
        SizeChunker::new(max_chunk_size, chunk_overlap)
            .unwrap_or_else(|e| panic!("unexpected error = {e}"))
    }

    /// Chunks `source_code` as a file named `test-file.txt`, asserting that chunking succeeds.
    fn chunk_source<'a>(chunker: &SizeChunker, source_code: &'a str) -> Vec<Chunk<'a>> {
        let mut chunks = Vec::new();
        let result = chunker.chunk_file(
            &mut chunks,
            &File {
                source_code,
                file_path: "test-file.txt",
            },
        );

        assert!(result.is_ok());
        chunks
    }

    #[test]
    fn test_new_with_too_large_chunk_overlap() {
        // An overlap equal to the chunk size must already be rejected.
        assert!(SizeChunker::new(10, 10).is_err());
    }

    #[test]
    fn test_chunk_file() {
        let chunks = chunk_source(&make_chunker(200, 0), SIZE_CHUNKER_TEST_CONTENT);

        // Comparing whole vectors checks the chunk count and every chunk in one go.
        let contents: Vec<&str> = chunks.iter().map(|chunk| chunk.content).collect();
        assert_eq!(
            vec![
                SIZE_CHUNKER_TEST_CHUNK1,
                SIZE_CHUNKER_TEST_CHUNK2,
                SIZE_CHUNKER_TEST_CHUNK3,
            ],
            contents
        );

        let start_bytes: Vec<usize> = chunks.iter().map(|chunk| chunk.start_byte).collect();
        assert_eq!(
            vec![
                0,
                SIZE_CHUNKER_TEST_CHUNK1.len(),
                SIZE_CHUNKER_TEST_CHUNK1.len() + SIZE_CHUNKER_TEST_CHUNK2.len(),
            ],
            start_bytes
        );
    }

    #[test]
    fn test_chunk_file_with_overlap() {
        let chunks = chunk_source(&make_chunker(5, 1), "1234567890");

        // Each chunk repeats the last byte of the one before it.
        let contents: Vec<&str> = chunks.iter().map(|chunk| chunk.content).collect();
        assert_eq!(vec!["12345", "56789", "90"], contents);

        let start_bytes: Vec<usize> = chunks.iter().map(|chunk| chunk.start_byte).collect();
        assert_eq!(vec![0, 4, 8], start_bytes);
    }

    #[test]
    fn test_chunk_file_with_unicode() {
        let source_code = "Hello тест мир! This is a тест with Cyrillic тext.";

        // Slicing a `str` off a character boundary panics, so getting chunks back at all is
        // the main signal here; the UTF-8 check below mirrors that expectation.
        let chunks = chunk_source(&make_chunker(20, 0), source_code);

        assert!(
            chunks
                .iter()
                .all(|chunk| std::str::from_utf8(chunk.content.as_bytes()).is_ok())
        );
    }
}
