File size: 5,482 Bytes
946a274
 
0241b66
d7d471b
a40cc94
 
d7d471b
 
 
 
946a274
 
 
d7d471b
30c739d
 
d7d471b
 
 
 
 
 
 
 
 
 
 
 
 
946a274
0241b66
 
30c739d
946a274
af926fe
d7d471b
946a274
 
23c71d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0241b66
946a274
d7d471b
 
 
23c71d8
 
a40cc94
0241b66
 
 
 
 
 
 
 
 
 
 
d7d471b
af926fe
d7d471b
 
 
 
6f874f7
d55b380
 
d7d471b
6f874f7
0241b66
 
 
 
 
 
 
 
 
 
 
 
 
 
af926fe
6f874f7
af926fe
 
6f874f7
d7d471b
 
 
 
0241b66
946a274
d7d471b
 
 
946a274
 
 
6f874f7
946a274
 
 
 
 
 
 
 
 
 
0f4b7f9
946a274
 
39bca12
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import gradio as gr
from gradio_client import Client
from huggingface_hub import HfApi
import logging
import time
import os

# Set up logging
# Root logging is configured once at import time: INFO and above, each
# record prefixed with its timestamp and level name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-scoped logger shared by every function in this file.
logger = logging.getLogger(__name__)

# Function to call the API and get the result
def call_api(prompt):
    """Send *prompt* to the MiniMax-Text-01 Space and return its reply.

    A new ``Client`` is created on every call, so each chunk is sent over a
    fresh connection. Any exception raised while connecting, logging the
    prompt preview, or predicting is logged and re-raised as ``gr.Error``.
    """
    # Fixed request settings for the Space's "/chat" endpoint.
    request = {
        "message": prompt,
        "max_tokens": 12800,
        "temperature": 0.1,
        "top_p": 0.9,
        "api_name": "/chat",
    }
    try:
        space = Client("MiniMaxAI/MiniMax-Text-01")
        # Only the first 100 characters are logged to keep the log readable.
        logger.info(f"Calling API with prompt: {prompt[:100]}...")
        reply = space.predict(**request)
        logger.info("API call successful.")
    except Exception as err:
        logger.error(f"API call failed: {err}")
        raise gr.Error(f"API call failed: {str(err)}")
    return reply

# Function to segment the text into overlapping chunks of up to 1500 words (a new chunk starts every 1250 words)
def segment_text(text, chunk_size=1500, overlap=250):
    """Split *text* into overlapping chunks of whitespace-delimited words.

    Consecutive chunks share ``overlap`` words, so a new chunk starts every
    ``chunk_size - overlap`` words (1250 with the defaults). Splitting stops
    as soon as a chunk reaches the last word, so no trailing chunk is emitted
    that would only repeat words already contained in the previous chunk.

    Args:
        text: The text to split. Words are separated on any whitespace, and
            each chunk is re-joined with single spaces.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A list of chunk strings; empty when *text* contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size.")

    words = text.split()
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), stride):
        chunks.append(" ".join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This chunk already ends on the final word; another one would
            # consist solely of words from the overlap region.
            break
    logger.info(f"Segmented text into {len(chunks)} chunks.")
    return chunks

# Function to read file content with fallback encoding
def read_file_content(file):
    """Return the text of an uploaded file, decoding UTF-8 with a latin-1 fallback.

    Args:
        file: One of
            * a filesystem path (``str`` or ``os.PathLike``), opened in
              binary mode and read in full;
            * a file-like object exposing ``read()``;
            * a ``bytes``-like object holding the raw content.

    Returns:
        The decoded text. If a file-like object's ``read()`` already yields
        ``str`` (text mode), that string is returned unchanged.

    Raises:
        gr.Error: If no file was supplied or the content cannot be read.
    """
    try:
        if file is None:
            raise ValueError("No file was provided.")

        if isinstance(file, (str, os.PathLike)):
            # A path on disk: read the raw bytes ourselves.
            with open(file, "rb") as handle:
                raw = handle.read()
        elif hasattr(file, "read"):
            raw = file.read()
        else:
            raw = file

        if isinstance(raw, str):
            logger.info("File content is already text; no decoding needed.")
            return raw

        # Decode from the bytes already in memory, so the fallback does not
        # need to seek and re-read the source.
        try:
            content = raw.decode('utf-8')
            logger.info("File read successfully with UTF-8 encoding.")
        except UnicodeDecodeError:
            logger.warning("UTF-8 encoding failed. Trying latin-1 encoding.")
            # latin-1 maps every byte value to a character, so this cannot fail.
            content = raw.decode('latin-1')
            logger.info("File read successfully with latin-1 encoding.")
        return content
    except Exception as e:
        logger.error(f"Failed to read file: {e}")
        raise gr.Error(f"Failed to read file: {str(e)}") from e

# Function to process the text and make API calls with rate limiting
def process_text(file, prompt):
    """Run *prompt* against every chunk of an uploaded file and upload each reply.

    The file is read and segmented; each chunk is sent to ``call_api`` with
    *prompt* prepended, and the reply is uploaded to the ``TeacherPuffy/book2``
    dataset repository as ``output_<index>.txt`` (zero-based index). A
    15-second pause separates consecutive API calls.

    Args:
        file: The uploaded file, in any form ``read_file_content`` accepts.
        prompt: Instruction text placed before each chunk.

    Returns:
        A status message for the UI.

    Raises:
        gr.Error: On any failure. Each failure is wrapped exactly once, with
            the chunk number when it happens inside the loop; errors that are
            already ``gr.Error`` are passed through rather than re-wrapped.
    """
    try:
        logger.info("Starting text processing...")

        text = read_file_content(file)
        logger.info(f"Text length: {len(text)} characters.")

        chunks = segment_text(text)
        if not chunks:
            # Nothing to send; do not report a successful upload.
            logger.warning("Uploaded file contains no text; nothing to process.")
            return "No text found in the uploaded file; nothing to process."

        token = os.environ.get("HUGGINGFACE_TOKEN")
        if not token:
            raise ValueError("Hugging Face token not found in environment variables.")
        hf_api = HfApi(token=token)

        # Repository name on Hugging Face Hub
        repo_name = "TeacherPuffy/book2"

        total = len(chunks)
        for idx, chunk in enumerate(chunks):
            number = idx + 1
            logger.info(f"Processing chunk {number}/{total}")

            try:
                result = call_api(f"{prompt}\n\n{chunk}")
            except Exception as e:
                # Prefer the raw message of a gr.Error when it exposes one
                # (assumed attribute name: `message`); otherwise use str(e).
                detail = getattr(e, "message", None) or str(e)
                logger.error(f"Failed to process chunk {number}: {detail}")
                raise gr.Error(f"Failed to process chunk {number}: {detail}") from e
            logger.info(f"Chunk {number} processed successfully.")

            try:
                logger.info(f"Uploading chunk {number} to Hugging Face...")
                hf_api.upload_file(
                    path_or_fileobj=result.encode('utf-8'),  # Convert result to bytes
                    path_in_repo=f"output_{idx}.txt",  # File name in the repository
                    repo_id=repo_name,
                    repo_type="dataset",
                )
            except Exception as e:
                logger.error(f"Failed to upload chunk {number} to Hugging Face: {e}")
                raise gr.Error(f"Failed to upload chunk {number} to Hugging Face: {str(e)}") from e
            logger.info(f"Chunk {number} uploaded to Hugging Face successfully.")

            # Rate limiting: pause between calls, but not after the last chunk.
            if number < total:
                logger.info("Waiting 15 seconds before the next API call...")
                time.sleep(15)

        return "All chunks processed and uploaded to Hugging Face."

    except gr.Error:
        # Already logged and phrased for the user where it was raised.
        raise
    except Exception as e:
        logger.error(f"An error occurred during processing: {e}")
        raise gr.Error(f"An error occurred: {str(e)}") from e

# Gradio interface
# Components are declared in the order they should appear on the page:
# heading, an input row (file + prompt), a status row, then the submit button.
with gr.Blocks() as demo:
    gr.Markdown("## Text File Processor with Rate-Limited API Calls")
    with gr.Row():
        # NOTE(review): the value gr.File hands to the callback depends on its
        # `type` setting and the installed Gradio version — confirm it matches
        # what read_file_content expects.
        file_input = gr.File(label="Upload Text File")
        prompt_input = gr.Textbox(label="Enter Prompt")
    with gr.Row():
        # Shows the string returned by process_text.
        output_message = gr.Textbox(label="Status Message")
    submit_button = gr.Button("Submit")
    
    # Clicking the button calls process_text(file, prompt) and writes its
    # return value into the status textbox.
    submit_button.click(
        process_text,
        inputs=[file_input, prompt_input],
        outputs=[output_message]
    )

# Launch the Gradio app with a public link (share=True).
# This runs whenever the module is executed or imported; there is no
# `if __name__ == "__main__"` guard.
demo.launch(share=True)