jbilcke-hf HF Staff committed on
Commit
e8518d0
·
1 Parent(s): 9cdaa70
vms/ui/project/services/training.py CHANGED
@@ -1664,25 +1664,25 @@ class TrainingService:
1664
  # Check in lora_weights directory
1665
  lora_weights_dir = self.app.output_path / "lora_weights"
1666
  if lora_weights_dir.exists():
1667
- logger.info(f"Found lora_weights directory: {lora_weights_dir}")
1668
 
1669
  # Look for the latest checkpoint directory in lora_weights
1670
  lora_checkpoints = [d for d in lora_weights_dir.glob("*") if d.is_dir() and d.name.isdigit()]
1671
  if lora_checkpoints:
1672
  latest_lora_checkpoint = max(lora_checkpoints, key=lambda x: int(x.name))
1673
- logger.info(f"Found latest LoRA checkpoint: {latest_lora_checkpoint}")
1674
 
1675
  # Extract step count from directory name
1676
  result["steps"] = int(latest_lora_checkpoint.name)
1677
 
1678
  # List contents of the latest checkpoint directory
1679
  checkpoint_contents = list(latest_lora_checkpoint.glob("*"))
1680
- logger.info(f"Contents of LoRA checkpoint {latest_lora_checkpoint.name}: {checkpoint_contents}")
1681
 
1682
  # Check for weights in the latest LoRA checkpoint
1683
  lora_safetensors = latest_lora_checkpoint / "pytorch_lora_weights.safetensors"
1684
  if lora_safetensors.exists():
1685
- logger.info(f"Found weights in latest LoRA checkpoint: {lora_safetensors}")
1686
  result["path"] = str(lora_safetensors)
1687
  return result
1688
 
@@ -1697,14 +1697,14 @@ class TrainingService:
1697
  for weight_file in possible_weight_files:
1698
  weight_path = latest_lora_checkpoint / weight_file
1699
  if weight_path.exists():
1700
- logger.info(f"Found weights file {weight_file} in latest LoRA checkpoint: {weight_path}")
1701
  result["path"] = str(weight_path)
1702
  return result
1703
 
1704
  # Check if any .safetensors files exist
1705
  safetensors_files = list(latest_lora_checkpoint.glob("*.safetensors"))
1706
  if safetensors_files:
1707
- logger.info(f"Found .safetensors files in LoRA checkpoint: {safetensors_files}")
1708
  # Return the first .safetensors file found
1709
  result["path"] = str(safetensors_files[0])
1710
  return result
@@ -1712,11 +1712,12 @@ class TrainingService:
1712
  # Fallback: check for direct safetensors file in lora_weights root
1713
  lora_safetensors = lora_weights_dir / "pytorch_lora_weights.safetensors"
1714
  if lora_safetensors.exists():
1715
- logger.info(f"Found weights in lora_weights directory: {lora_safetensors}")
1716
  result["path"] = str(lora_safetensors)
1717
  return result
1718
  else:
1719
  logger.info(f"pytorch_lora_weights.safetensors not found in lora_weights directory")
 
1720
 
1721
  # If not found in root or lora_weights, log the issue and check fallback
1722
  logger.warning(f"Model weights not found at expected location: {model_output_safetensors_path}")
 
1664
  # Check in lora_weights directory
1665
  lora_weights_dir = self.app.output_path / "lora_weights"
1666
  if lora_weights_dir.exists():
1667
+ #logger.info(f"Found lora_weights directory: {lora_weights_dir}")
1668
 
1669
  # Look for the latest checkpoint directory in lora_weights
1670
  lora_checkpoints = [d for d in lora_weights_dir.glob("*") if d.is_dir() and d.name.isdigit()]
1671
  if lora_checkpoints:
1672
  latest_lora_checkpoint = max(lora_checkpoints, key=lambda x: int(x.name))
1673
+ #logger.info(f"Found latest LoRA checkpoint: {latest_lora_checkpoint}")
1674
 
1675
  # Extract step count from directory name
1676
  result["steps"] = int(latest_lora_checkpoint.name)
1677
 
1678
  # List contents of the latest checkpoint directory
1679
  checkpoint_contents = list(latest_lora_checkpoint.glob("*"))
1680
+ #logger.info(f"Contents of LoRA checkpoint {latest_lora_checkpoint.name}: {checkpoint_contents}")
1681
 
1682
  # Check for weights in the latest LoRA checkpoint
1683
  lora_safetensors = latest_lora_checkpoint / "pytorch_lora_weights.safetensors"
1684
  if lora_safetensors.exists():
1685
+ #logger.info(f"Found weights in latest LoRA checkpoint: {lora_safetensors}")
1686
  result["path"] = str(lora_safetensors)
1687
  return result
1688
 
 
1697
  for weight_file in possible_weight_files:
1698
  weight_path = latest_lora_checkpoint / weight_file
1699
  if weight_path.exists():
1700
+ #logger.info(f"Found weights file {weight_file} in latest LoRA checkpoint: {weight_path}")
1701
  result["path"] = str(weight_path)
1702
  return result
1703
 
1704
  # Check if any .safetensors files exist
1705
  safetensors_files = list(latest_lora_checkpoint.glob("*.safetensors"))
1706
  if safetensors_files:
1707
+ #logger.info(f"Found .safetensors files in LoRA checkpoint: {safetensors_files}")
1708
  # Return the first .safetensors file found
1709
  result["path"] = str(safetensors_files[0])
1710
  return result
 
1712
  # Fallback: check for direct safetensors file in lora_weights root
1713
  lora_safetensors = lora_weights_dir / "pytorch_lora_weights.safetensors"
1714
  if lora_safetensors.exists():
1715
+ #logger.info(f"Found weights in lora_weights directory: {lora_safetensors}")
1716
  result["path"] = str(lora_safetensors)
1717
  return result
1718
  else:
1719
  logger.info(f"pytorch_lora_weights.safetensors not found in lora_weights directory")
1720
+ pass
1721
 
1722
  # If not found in root or lora_weights, log the issue and check fallback
1723
  logger.warning(f"Model weights not found at expected location: {model_output_safetensors_path}")