TheoLvs committed on
Commit
961cc08
·
1 Parent(s): 7c9a1bf

Updated submission

Browse files
Files changed (4) hide show
  1. .gitignore +16 -1
  2. app.py +208 -60
  3. modelcard.md +61 -0
  4. requirements.txt +4 -1
.gitignore CHANGED
@@ -1,2 +1,17 @@
1
-
2
  .ipynb_checkpoints/sandbox-checkpoint.ipynb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  .ipynb_checkpoints/sandbox-checkpoint.ipynb
2
+
3
+ auto_evals/
4
+ venv/
5
+ __pycache__/
6
+ .env
7
+ .ipynb_checkpoints
8
+ *ipynb
9
+ .vscode/
10
+
11
+ eval-queue/
12
+ eval-results/
13
+ eval-queue-bk/
14
+ eval-results-bk/
15
+ logs/
16
+
17
+ emissions.csv
app.py CHANGED
@@ -1,76 +1,224 @@
1
  import gradio as gr
2
- import spaces
3
  from codecarbon import EmissionsTracker
4
-
5
- # Import necessary libraries
6
- from sklearn.model_selection import train_test_split
7
- from sklearn.ensemble import RandomForestClassifier
8
- from sklearn.metrics import classification_report, accuracy_score
9
- import pandas as pd
10
  import numpy as np
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- # Let's create a sample dataset (you can replace this with your own data)
13
- def create_sample_data():
14
- np.random.seed(42)
15
- n_samples = 10000
 
 
 
 
 
 
 
 
 
16
 
17
- # Create features (X)
18
- X = np.random.randn(n_samples, 4) # 4 features
 
19
 
20
- # Create target (y) - binary classification
21
- y = (X[:, 0] + X[:, 1] + X[:, 2] > 0).astype(int)
22
 
23
- return X, y
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
- # Get data (replace this with your data loading code)
26
- X, y = create_sample_data()
27
- tracker = EmissionsTracker()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- @spaces.GPU
30
- def submit(username):
 
 
 
 
 
 
 
 
 
31
 
32
- tracker.start()
 
 
 
 
 
 
 
 
 
 
33
 
34
- tracker.start_task("train_model")
35
- # Split the data into training and testing sets
36
- X_train, X_test, y_train, y_test = train_test_split(
37
- X, y, test_size=0.2, random_state=42
38
- )
39
-
40
- # Initialize the model
41
- rf_model = RandomForestClassifier(
42
- n_estimators=1000,
43
- max_depth=5,
44
- random_state=42
45
- )
46
 
47
- # Train the model
48
- print("Training the model...")
49
- rf_model.fit(X_train, y_train)
50
 
51
- training_emissions = tracker.stop_task()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- tracker.start_task("inference")
54
- rf_model.predict(X_test)
55
- inference_emissions = tracker.stop_task()
56
-
57
- emissions = inference_emissions.emissions
58
- energy = inference_emissions.energy_consumed
59
-
60
- return [emissions, energy]
61
-
62
- # Update the interface configuration
63
- demo = gr.Interface(
64
- fn=submit,
65
- inputs=gr.Textbox(label="Username"),
66
- outputs=[
67
- gr.Number(label="Emissions (kgCO2eq)", precision=6),
68
- gr.Number(label="Energy Consumed (kWh)", precision=6)
69
- ],
70
- title="Carbon Emissions Tracker",
71
- description="Track the carbon emissions and energy consumption of model training and inference."
72
- )
73
-
74
- # Launch the Gradio interface
75
  if __name__ == "__main__":
76
  demo.launch()
 
1
  import gradio as gr
 
2
  from codecarbon import EmissionsTracker
3
+ from datasets import load_dataset
 
 
 
 
 
4
  import numpy as np
5
+ from sklearn.metrics import accuracy_score
6
+ import random
7
+ import os
8
+ import json
9
+ from datetime import datetime
10
+ from huggingface_hub import HfApi
11
+ from huggingface_hub import upload_file
12
+ import tempfile
13
+ from dotenv import load_dotenv
14
+
15
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Hugging Face token used to upload results to the leaderboard dataset.
# It is read from the HF_TOKEN_TEXT environment variable.
# NOTE: never print or log the token itself -- Space logs may be visible
# to other people.
HF_TOKEN = os.getenv("HF_TOKEN_TEXT")
if not HF_TOKEN:
    print("Warning: HF_TOKEN_TEXT not found in environment variables. Submissions will not work.")

# Module-level tracker shared by every call to ``evaluate``.
# NOTE(review): ``allow_multiple_runs=True`` is a CodeCarbon option; confirm
# its exact semantics against the pinned codecarbon version.
tracker = EmissionsTracker(allow_multiple_runs=True)
25
+
26
+ # Function to get space username and URL
27
def get_space_info():
    """Return ``(username, space_url)`` for the Space running this app.

    The ``SPACE_ID`` environment variable is read and treated as
    ``"<owner>/<space-name>"``: the owner is everything before the first
    ``/`` and the URL is built from the full identifier.

    Returns:
        tuple[str, str]: ``(owner, "https://huggingface.co/spaces/<SPACE_ID>")``
        when ``SPACE_ID`` is set and non-empty, otherwise the placeholders
        ``("local-user", "local-development")``.
    """
    space_id = os.getenv("SPACE_ID", "")
    if not space_id:
        # Variable unset or empty: report the local-development placeholders.
        return "local-user", "local-development"

    # str.split always returns at least one element, so this cannot fail
    # even when the identifier contains no "/".
    username = space_id.split("/")[0]
    return username, f"https://huggingface.co/spaces/{space_id}"
37
 
38
def clean_emissions_data(emissions_data):
    """Return the attributes of *emissions_data* as a dict, minus unwanted fields.

    Args:
        emissions_data: Any object exposing its fields through ``__dict__``
            (here, the object returned by ``tracker.stop_task()``).

    Returns:
        dict: A new dict of attribute name -> value with ``timestamp``,
        ``project_name``, ``experiment_id``, ``latitude`` and ``longitude``
        left out. The input object is not modified.
    """
    excluded = {"timestamp", "project_name", "experiment_id", "latitude", "longitude"}
    return {
        name: value
        for name, value in vars(emissions_data).items()
        if name not in excluded
    }
43
+
44
def evaluate():
    """Run the random-baseline inference pass and measure its footprint.

    Labels are drawn uniformly at random for every example of the
    module-level ``test_dataset``; accuracy is computed against the true
    labels, all inside a CodeCarbon task named ``"inference"``.

    Returns:
        list: ``[accuracy, emissions_gco2eq, energy_consumed_wh, results_json]``
        where the first three are numbers for the summary widgets and
        ``results_json`` is a JSON string holding the complete submission
        payload (username, space URL, timestamp, metrics and the cleaned
        tracker data).
    """
    username, space_url = get_space_info()

    tracker.start()
    tracker.start_task("inference")
    try:
        # --- Replace this baseline with your model loading + inference ---
        true_labels = test_dataset["label"]
        n_labels = len(LABEL_MAPPING)
        predictions = [random.randrange(n_labels) for _ in range(len(true_labels))]

        accuracy = float(accuracy_score(true_labels, predictions))
    finally:
        # Always close the task so a failed run does not leave "inference"
        # open on the shared module-level tracker.
        emissions_data = tracker.stop_task()

    # The tracker values are multiplied by 1000 to report Wh and gCO2eq
    # (the UI labels used for these outputs).
    energy_wh = emissions_data.energy_consumed * 1000
    emissions_g = emissions_data.emissions * 1000

    results = {
        "username": username,
        "space_url": space_url,
        "submission_timestamp": datetime.now().isoformat(),
        "accuracy": accuracy,
        "energy_consumed_wh": energy_wh,
        "emissions_gco2eq": emissions_g,
        "emissions_data": clean_emissions_data(emissions_data),
    }

    # Summary values first, then the detailed payload for the JSON widget.
    return [
        accuracy,
        emissions_g,
        energy_wh,
        json.dumps(results, indent=2),
    ]
80
 
81
def submit_results(results_json):
    """Upload evaluation results to the public leaderboard dataset.

    Args:
        results_json: The payload produced by ``evaluate``, as delivered by
            the ``gr.JSON`` component. Both an already-parsed dict and a raw
            JSON string are accepted.

    Returns:
        The value of the ``gr.Warning`` / ``gr.Info`` call used to notify
        the user (a warning when there is nothing to submit or no token is
        configured, an info popup on success).

    Raises:
        Any exception raised by the upload propagates to the caller; the
        temporary file is removed in every case.
    """
    if not results_json:
        return gr.Warning("No results to submit")

    if not HF_TOKEN:
        return gr.Warning("HF_TOKEN not found. Please set up your Hugging Face token.")

    # Normalise to a dict: the payload is indexed below to build the file name.
    results = json.loads(results_json) if isinstance(results_json, str) else results_json
    results_str = json.dumps(results)

    # delete=False because the file is reopened by path for the upload.
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
        f.write(results_str)
        temp_path = f.name

    try:
        api = HfApi(token=HF_TOKEN)
        path_in_repo = f"submissions/{results['username']}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=path_in_repo,
            repo_id="frugal-ai-challenge/public-leaderboard-text",
            repo_type="dataset",
            token=HF_TOKEN,
        )
    finally:
        # Remove the temporary file even when the upload fails.
        os.unlink(temp_path)

    return gr.Info("Results submitted successfully to the leaderboard! 🎉")
115
 
116
# Dataset label string -> integer class id.
LABEL_MAPPING = {
    "0_not_relevant": 0,  # No relevant claim detected
    "1_not_happening": 1,  # Global warming is not happening
    "2_not_human": 2,  # Not caused by humans
    "3_not_bad": 3,  # Not bad or beneficial
    "4_solutions_harmful_unnecessary": 4,  # Solutions harmful/unnecessary
    "5_science_unreliable": 5,  # Science is unreliable
    "6_proponents_biased": 6,  # Proponents are biased
    "7_fossil_fuels_needed": 7,  # Fossil fuels are needed
}

# Integer class id -> human-readable description (display only).
LABEL_DESCRIPTIONS = {
    0: "No relevant claim detected",
    1: "Global warming is not happening",
    2: "Not caused by humans",
    3: "Not bad or beneficial",
    4: "Solutions harmful/unnecessary",
    5: "Science is unreliable",
    6: "Proponents are biased",
    7: "Fossil fuels are needed",
}

# Load and prepare the dataset
print("Loading dataset...")
dataset = load_dataset("QuotaClimat/frugalaichallenge-text-train")

# Convert string labels to integers
dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})

# Hold out 20% of the "train" split as the evaluation set (fixed seed so
# every run scores against the same examples).
train_test = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test["train"]
test_dataset = train_test["test"]

# Display a short preview; bounded by the split size so a tiny test split
# does not make the selection fail.
preview_size = min(5, len(test_dataset))
print("\nFirst 5 rows of test set:")
for i, example in enumerate(test_dataset.select(range(preview_size))):
    print(f"\nExample {i+1}:")
    print(f"Text: {example['quote'][:100]}...")
    print(f"Label: {example['label']} - {LABEL_DESCRIPTIONS[example['label']]}")
158
+
159
# Create the demo interface: a three-tab Gradio app (instructions, model
# evaluation/submission, model card).
with gr.Blocks() as demo:

    gr.Markdown("""
    # Frugal AI Challenge - Text task - Submission portal
    ## Climate Disinformation Classification
    """)

    with gr.Tabs():

        with gr.Tab("Instructions"):

            gr.Markdown("""
            To submit your results, please follow the steps below:

            ## Prepare your model submission
            1. Clone the space of this portal on your own Hugging Face account.
            2. Modify the ``evaluate`` function to replace the baseline by your model loading and inference within the inference pass where the energy consumption and emissions are tracked.
            3. If needed, complete the requirements and/or add any necessary dependencies to your space.
            4. Write down your model card in the ``modelcard.md`` file.
            5. Deploy your space and verify that it works.
            6. (Optional) You can change the Space hardware to use any GPU directly on Hugging Face.

            ## Submit your model to the leaderboard in the ``Model Submission`` tab
            7. Step 1 - Evaluate model: Click on the button to evaluate your model. This will run your model, compute the accuracy on the test set (20% of the train set), and track the energy consumption and emissions.
            8. Step 2 - Submit to leaderboard: Click on the button to submit your results to the leaderboard. This will upload the results to the leaderboard dataset and update the leaderboard.
            9. You can see the leaderboard at https://huggingface.co/datasets/frugal-ai-challenge/public-leaderboard-text
            """)

        with gr.Tab("Model Submission"):
            gr.Markdown("## Random Baseline Model")

            with gr.Row():
                with gr.Column(scale=1):
                    evaluate_btn = gr.Button("1. Evaluate model", variant="secondary")
                with gr.Column(scale=1):
                    submit_btn = gr.Button("2. Submit to leaderboard", variant="primary", size="lg")

            with gr.Row():
                accuracy_output = gr.Number(label="Accuracy", precision=4)
                emissions_output = gr.Number(label="Emissions (gCO2eq)", precision=12)
                energy_output = gr.Number(label="Energy Consumed (Wh)", precision=12)

            with gr.Row():
                results_json = gr.JSON(label="Detailed Results", visible=True)

            # Step 1: run the evaluation and fill the summary + detail widgets.
            evaluate_btn.click(
                evaluate,
                inputs=None,
                outputs=[accuracy_output, emissions_output, energy_output, results_json],
            )

            # Step 2: push the detailed results shown in the JSON widget.
            submit_btn.click(
                submit_results,
                inputs=[results_json],
                outputs=None,  # No need for output component with popups
            )

        with gr.Tab("Model Card"):
            # The model card is read once, when the interface is built.
            with open("modelcard.md", "r", encoding="utf-8") as f:
                model_card_content = f.read()
            gr.Markdown(model_card_content)
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  if __name__ == "__main__":
224
  demo.launch()
modelcard.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Random Baseline Model Card
2
+
3
+ ## Model Description
4
+
5
+ **Model Type:** Random Baseline Classifier
6
+ **Task:** Climate Change Disinformation Classification
7
+ **Version:** 1.0.0
8
+ **Last Updated:** 2024
9
+
10
+ ### Overview
11
+ This is a random baseline model for climate change disinformation classification. It randomly assigns labels to text inputs, serving as a baseline for comparing more sophisticated models.
12
+
13
+ ### Intended Use
14
+ - **Primary Use:** Baseline comparison for climate disinformation classification models
15
+ - **Intended Users:** Researchers and developers working on climate disinformation detection
16
+ - **Out-of-Scope Uses:** Not intended for production or real-world classification tasks
17
+
18
+ ## Training Data
19
+
20
+ **Dataset:** QuotaClimat/frugalaichallenge-text-train
21
+ - Size: ~6000 examples
22
+ - Split: 80% train, 20% test
23
+ - Labels: 8 categories of climate disinformation claims
24
+
25
+ ### Labels
26
+ 0. No relevant claim detected
27
+ 1. Global warming is not happening
28
+ 2. Not caused by humans
29
+ 3. Not bad or beneficial
30
+ 4. Solutions harmful/unnecessary
31
+ 5. Science is unreliable
32
+ 6. Proponents are biased
33
+ 7. Fossil fuels are needed
34
+
35
+ ## Performance
36
+
37
+ ### Metrics
38
+ - **Accuracy:** ~12.5% (random chance)
39
+ - **Environmental Impact:**
40
+ - Emissions (gCO2eq)
41
+ - Energy Consumed (Wh)
42
+
43
+ ### Limitations
44
+ - Random predictions with no learning
45
+ - No consideration of input text
46
+ - Serves only as a baseline reference
47
+
48
+ ## Ethical Considerations
49
+ - Model makes random predictions and should not be used for actual classification
50
+ - Dataset contains sensitive topics related to climate disinformation
51
+ - Environmental impact is tracked to promote awareness of AI's carbon footprint
52
+
53
+ ## Environmental Impact
54
+ This model tracks its environmental impact using CodeCarbon, measuring:
55
+ - Carbon emissions
56
+ - Energy consumption
57
+
58
+ ## Caveats and Recommendations
59
+ - Use only as a baseline comparison
60
+ - Not suitable for production use
61
+ - Consider environmental impact when running experiments
requirements.txt CHANGED
@@ -1,2 +1,5 @@
1
  codecarbon==2.8.1
2
- scikit-learn==1.5.1
 
 
 
 
1
  codecarbon==2.8.1
2
+ scikit-learn==1.5.1
3
+ datasets==3.2.0
4
+ huggingface-hub==0.26.3
5
+ python-dotenv==1.0.1