feat(harvest): convert notebook to reusable service

thekaveman · thekaveman · commit 3b29d5dcc949 · 2024-04-25T00:09:39.000-07:00
diff --git a/compiler_admin/services/harvest.py b/compiler_admin/services/harvest.py
@@ -0,0 +1,91 @@
+from datetime import datetime, timedelta
+import os
+import sys
+from typing import TextIO
+
+import pandas as pd
+
+import compiler_admin.services.files as files
+
+# Harvest CSV columns read from the source file; passed as usecols when the CSV is loaded
+INPUT_COLUMNS = ["Date", "Client", "Project", "Notes", "Hours", "First Name", "Last Name"]
+
+# Toggl CSV columns written by default; the default value of convert_to_toggl's output_cols
+OUTPUT_COLUMNS = ["Email", "Start date", "Start time", "Duration", "Project", "Task", "Client", "Billable", "Description"]
+
+
+def _calc_start_time(group: pd.DataFrame):
+    """Return a copy of group with "Start time" offset by the total Duration of the preceding records (0 for the first)."""
+    offsets = group["Duration"].shift(fill_value=pd.to_timedelta("00:00:00")).cumsum()
+    return group.assign(**{"Start time": group["Start time"] + offsets})
+
+
+def _duration_str(duration: timedelta) -> str:
+    """Format duration as zero-padded HH:MM (e.g. 01:30); plain arithmetic, so the local timezone cannot shift the result."""
+    return "{:02d}:{:02d}".format(*divmod(int(duration.total_seconds()) // 60, 60))
+
+
+def _toggl_client_name():
+    """Look up TOGGL_CLIENT_NAME in the process environment; None when it is unset."""
+    return os.getenv("TOGGL_CLIENT_NAME")
+
+
+def convert_to_toggl(
+    source_path: str | TextIO = sys.stdin,
+    output_path: str | TextIO = sys.stdout,
+    client_name: str | None = None,
+    output_cols: list[str] = OUTPUT_COLUMNS,
+) -> None:
+    """Convert Harvest formatted entries in source_path to equivalent Toggl formatted entries.
+
+    Args:
+        source_path: The path to a readable CSV file of Harvest time entries; or a readable buffer of the same.
+        output_path: The path to a CSV file where Toggl time entries will be written; or a writeable buffer for the same.
+        client_name: Value for the Client and Project columns; when None, the TOGGL_CLIENT_NAME env var is used.
+        output_cols (list[str]): A list of column names for the output; passed on to files.write_csv.
+
+    Returns:
+        None. Either prints the resulting CSV data or writes to output_path.
+        The write itself is delegated to files.write_csv.
+    """
+    if client_name is None:
+        client_name = _toggl_client_name()
+
+    # read only the INPUT_COLUMNS from the CSV, parsing the Date column as dates
+    source = files.read_csv(source_path, usecols=INPUT_COLUMNS, parse_dates=["Date"], cache_dates=True)
+
+    # rename columns that can be imported as-is
+    source.rename(columns={"Project": "Task", "Notes": "Description", "Date": "Start date"}, inplace=True)
+
+    # static columns: Client and Project both take client_name, every entry is marked Billable
+    source["Client"] = client_name
+    source["Project"] = client_name
+    source["Billable"] = "Yes"
+
+    # add the Email column: lowercased first name at the compiler.la domain
+    source["Email"] = source["First Name"].apply(lambda x: f"{x.lower()}@compiler.la")
+
+    # Convert numeric Hours to timedelta Duration
+    source["Duration"] = source["Hours"].apply(pd.to_timedelta, unit="hours")
+
+    # Every entry starts at 09:00 before the per-group offsets are applied below
+    source["Start time"] = pd.to_timedelta("09:00:00")
+
+    user_days = (
+        source
+        # sort and group by email and date
+        .sort_values(["Email", "Start date"]).groupby(["Email", "Start date"], observed=False)
+        # calculate a start time within each group (excluding the groupby columns)
+        .apply(_calc_start_time, include_groups=False)
+    )
+
+    # convert timedeltas to duration strings
+    user_days["Duration"] = user_days["Duration"].apply(_duration_str)
+    user_days["Start time"] = user_days["Start time"].apply(_duration_str)
+
+    # re-sort by start date/time and user
+    # reset the index to get rid of the group multi index and fold the group columns back down
+    output_data = pd.DataFrame(data=user_days).reset_index()
+    output_data.sort_values(["Start date", "Start time", "Email"], inplace=True)
+
+    files.write_csv(output_path, output_data, output_cols)
diff --git a/tests/services/test_harvest.py b/tests/services/test_harvest.py
@@ -0,0 +1,111 @@
+import sys
+from datetime import timedelta
+from io import StringIO
+
+import numpy as np
+import pandas as pd
+import pytest
+
+import compiler_admin.services.harvest
+from compiler_admin.services.harvest import (
+    __name__ as MODULE,
+    files,
+    INPUT_COLUMNS,
+    OUTPUT_COLUMNS,
+    _calc_start_time,
+    _duration_str,
+    _toggl_client_name,
+    convert_to_toggl,
+)
+
+
+@pytest.fixture(autouse=True)
+def mock_environment(monkeypatch):  # autouse: applied to every test in this module without being requested
+    monkeypatch.setenv("TOGGL_CLIENT_NAME", "Test_Client")  # the variable _toggl_client_name() reads
+
+
+@pytest.fixture
+def spy_files(mocker):  # spy on harvest.files: calls are recorded and passed through to the real module
+    return mocker.patch.object(compiler_admin.services.harvest, "files", wraps=files)
+
+
+@pytest.fixture
+def mock_toggl_client_name(mocker):  # replaces harvest._toggl_client_name with a mock for call assertions
+    return mocker.patch(f"{MODULE}._toggl_client_name")
+
+
+def test_calc_start_time():
+    minutes = pd.to_timedelta(np.arange(1, 6), unit="m")
+    frame = pd.DataFrame(data={"Duration": minutes, "Start time": [pd.to_timedelta("09:00:00")] * len(minutes)})
+
+    result = _calc_start_time(frame)
+
+    assert result.columns.equals(frame.columns)
+    assert result["Duration"].equals(frame["Duration"])
+    assert result["Start time"].to_list() == [
+        # first row: nothing precedes it, stays at 09:00
+        pd.to_timedelta("09:00:00"),
+        # preceded by 1 minute
+        pd.to_timedelta("09:01:00"),
+        # preceded by 1 + 2 minutes
+        pd.to_timedelta("09:03:00"),
+        # preceded by 1 + 2 + 3 minutes
+        pd.to_timedelta("09:06:00"),
+        # preceded by 1 + 2 + 3 + 4 minutes
+        pd.to_timedelta("09:10:00"),
+    ]
+
+
+def test_duration_str():
+    # seconds are dropped: 1h 30m 15s formats as 01:30
+    duration = timedelta(hours=1, minutes=30, seconds=15)
+
+    formatted = _duration_str(duration)
+
+    assert isinstance(formatted, str) and formatted == "01:30"
+
+
+def test_toggl_client_name(monkeypatch):
+    # the autouse mock_environment fixture has already set the variable
+    before = _toggl_client_name()
+    monkeypatch.setenv("TOGGL_CLIENT_NAME", "New Test Client")
+    after = _toggl_client_name()
+    assert (before, after) == ("Test_Client", "New Test Client")
+
+
+def test_convert_to_toggl_mocked(harvest_file, spy_files, mock_toggl_client_name):
+    convert_to_toggl(harvest_file, client_name=None)
+
+    mock_toggl_client_name.assert_called_once()
+
+    spy_files.read_csv.assert_called_once()
+    read_call = spy_files.read_csv.call_args
+    assert read_call.args == (harvest_file,)
+    assert read_call.kwargs["usecols"] == INPUT_COLUMNS
+    assert read_call.kwargs["parse_dates"] == ["Date"]
+    assert read_call.kwargs["cache_dates"] is True
+
+    spy_files.write_csv.assert_called_once()
+    write_call = spy_files.write_csv.call_args
+    assert write_call.args[0] == sys.stdout
+    assert write_call.args[2] == OUTPUT_COLUMNS
+
+
+def test_convert_to_toggl_sample(harvest_file, toggl_file):
+    client = "Test Client 123"
+
+    with StringIO() as buffer:
+        convert_to_toggl(harvest_file, buffer, client)
+        csv_text = buffer.getvalue()
+
+    assert csv_text
+    assert isinstance(csv_text, str)
+    assert ",".join(OUTPUT_COLUMNS) in csv_text
+
+    sort_keys = ["Start date", "Start time", "Email"]
+    expected_df = pd.read_csv(toggl_file).sort_values(sort_keys)
+    actual_df = pd.read_csv(StringIO(csv_text)).sort_values(sort_keys)
+
+    assert set(actual_df.columns).issubset(expected_df.columns)
+    for column in ("Client", "Project"):
+        assert (actual_df[column] == client).all()