Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Python
__pycache__/
*.pyc
*.pyo
*.pyd

# Environnements
.venv/
venv/
.env

# Git
.git
.gitignore

# Dossiers
orchestrateur/**
data/**
3 changes: 0 additions & 3 deletions infra/Dockerfile → Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,10 @@ RUN pip install uv

WORKDIR /app


COPY pyproject.toml uv.lock ./

RUN uv sync

COPY . .

ENV PYTHONPATH=/app

CMD ["uv", "run", "pipelines/run.py"]
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,9 @@ La pipeline crée plusieurs fichier:
## Label Studio (annotation)

```bash
docker compose -f infra/docker-compose.yml up
docker compose --env-file .env -f infra/docker-compose.yml up -d
docker build -t biolit-pipeline .
docker run --rm --network biolit_network --env-file .env biolit-pipeline uv run python -m pipelines.run
```

UI : http://localhost:8080
Expand Down
32 changes: 17 additions & 15 deletions biolit/create_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
import polars as pl
from sqlalchemy import create_engine, text
import pandas as pd
import structlog
from dotenv import load_dotenv
LOGGER = structlog.get_logger()
load_dotenv()


# -------------------------
# Connexion DB
# -------------------------
Expand All @@ -14,8 +15,7 @@ def get_engine():
postgres_url = os.getenv("POSTGRES_URL")

if not postgres_url:
raise ValueError("Missing POSTGRES_URL")

raise ValueError("Missing DATABASE_URL")
return create_engine(postgres_url)

# -------------------------
Expand Down Expand Up @@ -43,7 +43,8 @@ def create_table():
nom_scientifique TEXT,
nom_commun TEXT,
categorie_programme BIGINT,
programme TEXT
programme TEXT,
validee TEXT
);
"""))

Expand Down Expand Up @@ -148,7 +149,8 @@ def insert_dataframe(df: pl.DataFrame):
nom_scientifique,
nom_commun,
categorie_programme,
programme
programme,
validee
) VALUES (
:id_observation,
:date_observation,
Expand All @@ -166,7 +168,8 @@ def insert_dataframe(df: pl.DataFrame):
:nom_scientifique,
:nom_commun,
:categorie_programme,
:programme
:programme,
:validee
)
ON CONFLICT (id_observation) DO NOTHING
"""), row)
Expand Down Expand Up @@ -202,19 +205,18 @@ def insert_enriched_dataframe(df: pd.DataFrame, engine):
ON CONFLICT (id_observation) DO NOTHING
"""), row)










def load_observations_from_db(engine) -> pl.DataFrame:
query = """
SELECT *
FROM observations
"""

return pl.read_database(query, engine)

def load_observations_from_db_for_S3(engine) -> pl.DataFrame:
query = """
SELECT id_observation, photos, latitude, longitude
FROM observations
LIMIT 10
"""
return pl.read_database(query, engine)
2 changes: 2 additions & 0 deletions biolit/export_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
def fetch_biolit_from_api():

url = os.getenv("BIOLIT_API_URL")
LOGGER.info("BIOLIT_API_URL:", value=url)


response = requests.get(url)
Expand Down Expand Up @@ -58,6 +59,7 @@ def normalize_column_name(col: str) -> str:
"common": "nom_commun",
"categorie-programme": "categorie_programme",
"programme": "programme",
"validee": "validee",
}


Expand Down
4 changes: 3 additions & 1 deletion biolit/geoloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,9 @@ def carte_points_biolit_checks_geoloc(file_to_map: Path):
popup_text = (
f"ID: {row['id']}<br>"
f"Côtier: {row['is_coastal']}<br>"
f"Distance : {row['distance_to_coast']} m"
f"Distance : {row['distance_to_coast']} m<br>"
f"Code Postal : {row['code_postal']}<br>"
f"Departement : {row['dep_nom']} m<br>"
)

folium.CircleMarker(
Expand Down
77 changes: 77 additions & 0 deletions biolit/label_studio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import os
import polars as pl
from dotenv import load_dotenv
import structlog
from label_studio_sdk import LabelStudio

LOGGER = structlog.get_logger()
load_dotenv()


def push_tasks_label_studio(project_title: str, df: pl.DataFrame):
api_key = os.getenv("LABEL_STUDIO_API_KEY")
url = "http://label-studio:8080"

client = LabelStudio(base_url=url, api_key=api_key)
projects = client.projects.list()

project_id = None
for project in projects:
if project.title == project_title:
project_id = project.id
LOGGER.info(f"Projet ID={project.id}, Nom={project.title} exists")
break

if project_id is None:
LOGGER.info(f"The project {project_title} does not exist.")
return

tasks = []

for row in df.to_dicts():
url = row["photos"]
filename = url.split("/")[-1]
tasks.append({
"data": {
"image": f"s3://crops-data/{row['id_observation']}/{filename}",
"latitude": row["latitude"],
"longitude": row["longitude"]
}
})

# Import tasks
client.projects.import_tasks(
id=project_id,
request=tasks,
return_task_ids=True,
)
LOGGER.info("The tasks have been successfully imported ; number of tasks :", value=len(df))


def delete_tasks_label_studio(project_title: str):
api_key = os.getenv("LABEL_STUDIO_API_KEY")
url = "http://label-studio:8080"

client = LabelStudio(base_url=url, api_key=api_key)
projects = client.projects.list()
project_id = None

for project in projects:
if project.title == project_title:
project_id = project.id
LOGGER.info(f"Projet ID={project.id}, Nom={project.title} exists")
break

if project_id is None:
LOGGER.info(f"The project {project_title} does not exist.")
return

tasks = client.tasks.list(project=project.id)
if not tasks:
LOGGER.info("No tasks to delete.")
return

task_ids = [t.id for t in tasks]
for task_id in task_ids:
client.tasks.delete(task_id)
LOGGER.info(f"{len(task_ids)} tasks deleted from project {project.id}")
133 changes: 133 additions & 0 deletions biolit/minio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import requests
import polars as pl
import structlog
import os
from dotenv import load_dotenv
from minio import Minio
from io import BytesIO
import json

LOGGER = structlog.get_logger()
load_dotenv()

def _upload_photos_minio(df: pl.DataFrame):
access_key = os.getenv("MINIO_ROOT_USER")
secret_key = os.getenv("MINIO_ROOT_PASSWORD")

# Config MinIO
client = Minio(
"minio:9000",
access_key=access_key,
secret_key=secret_key,
secure=False
)
LOGGER.info("Connected to S3")

bucket_name = "crops-data"

if not client.bucket_exists(bucket_name):
client.make_bucket(bucket_name)
LOGGER.info(f"Bucket created: {bucket_name}")
else:
LOGGER.info(f"Bucket already exists: {bucket_name}")

for row in df.to_dicts():
id_obs = row["id_observation"]
url = row["photos"]

filename = url.split("/")[-1]
object_name = f"{id_obs}/{filename}"

try:
client.stat_object(bucket_name, object_name)
LOGGER.info(f"Skipped (already exists): {object_name}")
continue
except Exception:
pass

response = requests.get(url)
if response.status_code == 200:
data = BytesIO(response.content)
client.put_object(
bucket_name,
object_name,
data,
length=len(response.content),
content_type="image/jpeg"
)
LOGGER.info(f"Uploaded: {object_name}")
else:
LOGGER.warning(f"Failed to fetch: {url}")

def _get_label_studios_info_minio():
access_key = os.getenv("MINIO_ROOT_USER")
secret_key = os.getenv("MINIO_ROOT_PASSWORD")

# Config MinIO
client = Minio(
"minio:9000",
access_key=access_key,
secret_key=secret_key,
secure=False
)
LOGGER.info("Connected to S3")

bucket_name = "label-data"

if not client.bucket_exists(bucket_name):
client.make_bucket(bucket_name)
LOGGER.info(f"Bucket created: {bucket_name}")
else:
LOGGER.info(f"Bucket already exists: {bucket_name}")

objects = client.list_objects(bucket_name, recursive=True)
all_annotations = []

for obj in objects:
LOGGER.info(f"Reading object: {obj.object_name}")

response = client.get_object(bucket_name, obj.object_name)

try:
content = response.read().decode("utf-8")
data = json.loads(content)

all_annotations.append(data)

LOGGER.info(f"Loaded OK: {obj.object_name}")
except Exception as e:
LOGGER.warning(f"Not JSON or failed: {obj.object_name} - {e}")

finally:
response.close()
response.release_conn()

return all_annotations


def annotations_to_polars(all_annotations):
rows = []

for ann in all_annotations:
annotation_id = ann.get("id")

task = ann.get("task", {})
task_id = task.get("id")
image = task.get("data", {}).get("image")

for res in ann.get("result", []):
value = res.get("value", {})

row = {
"task_id": task_id,
"annotation_id": annotation_id,
"type": res.get("type"),
"from_name": res.get("from_name"),
"label": value.get("choices"),
"image": image,
}

rows.append(row)
df = pl.DataFrame(rows)
LOGGER.info(df)
return df
13 changes: 0 additions & 13 deletions infra/.dockerignore

This file was deleted.

4 changes: 2 additions & 2 deletions infra/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ services:
image: heartexlabs/label-studio:1.13.1
container_name: biolit-label-studio
ports:
- "8080:8080"
- "8089:8080"
environment:
LABEL_STUDIO_BASE_DATA_DIR: /label-studio/data
LABEL_STUDIO_LOCAL_FILES_SERVING_ENABLED: "true"
LABEL_STUDIO_LOCAL_FILES_DOCUMENT_ROOT: /label-studio/files
LABEL_STUDIO_HOST: http://localhost:8080
LABEL_STUDIO_HOST: http://localhost:8089
LABEL_STUDIO_INTERNAL_HOST: http://label-studio:8080
volumes:
- ../data/label-studio:/label-studio/data
Expand Down
Loading
Loading