I am working on a DDPG-based modulation strategy for a dual active bridge (DAB) DC-DC converter. The script below runs without errors, but the action does not change in response to the reward.
```matlab
% Open the Simulink model
open_system('SPSDAB')
% Define the observation specification
obsInfo = rlNumericSpec([3 1],...
'LowerLimit',[-inf -inf 0]',...
'UpperLimit',[inf inf inf]');
obsInfo.Name = 'observations';
obsInfo.Description = 'integrated error, error, and measured voltage';
numObservations = obsInfo.Dimension(1);
% Define the action specification
actInfo = rlNumericSpec([1 1]);
actInfo.Name = 'Transfer ratio';
numActions = actInfo.Dimension(1);
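% NOTE: actInfo is created without 'LowerLimit'/'UpperLimit'. If the
% transfer (phase-shift) ratio is physically bounded, declaring the bounds
% lets the agent clip actions to the valid range, e.g. (the [-1, 1] range
% here is an assumption; the actual limits depend on the modulation scheme):
%   actInfo = rlNumericSpec([1 1],'LowerLimit',-1,'UpperLimit',1);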
% Create the environment object
env = rlSimulinkEnv('SPSDAB','SPSDAB/RL Agent',...
obsInfo,actInfo);
env.ResetFcn = @(in)localResetFcn(in);
Ts = 1.0;    % agent sample time (s)
Tf = 200;    % simulation stop time (s)
rng(0)       % fix the random seed for reproducibility
% Define the critic network structure
statePath = [
featureInputLayer(numObservations,'Normalization','none','Name','State')
fullyConnectedLayer(64,'Name','CriticStateFC1')
reluLayer('Name','CriticRelu1')
fullyConnectedLayer(32,'Name','CriticStateFC2')];
actionPath = [
featureInputLayer(numActions,'Normalization','none','Name','Action')
fullyConnectedLayer(32,'Name','CriticActionFC1')];
commonPath = [
additionLayer(2,'Name','add')
reluLayer('Name','CriticCommonRelu')
fullyConnectedLayer(1,'Name','CriticOutput')];
criticNetwork = layerGraph();
criticNetwork = addLayers(criticNetwork,statePath);
criticNetwork = addLayers(criticNetwork,actionPath);
criticNetwork = addLayers(criticNetwork,commonPath);
criticNetwork = connectLayers(criticNetwork,'CriticStateFC2','add/in1');
criticNetwork = connectLayers(criticNetwork,'CriticActionFC1','add/in2');
% Define the critic representation options
criticOpts = rlRepresentationOptions('LearnRate', 1e-03, 'GradientThreshold', 1);
critic = rlQValueRepresentation(criticNetwork, obsInfo, actInfo, 'Observation', {'State'}, 'Action', {'Action'}, criticOpts);
% Define the actor network structure
actorNetwork = [
featureInputLayer(numObservations, 'Normalization', 'none', 'Name', 'State')
fullyConnectedLayer(64, 'Name', 'actorFC1')
reluLayer('Name', 'actorRelu1')
fullyConnectedLayer(32, 'Name', 'actorFC2')
reluLayer('Name', 'actorRelu2')
fullyConnectedLayer(numActions, 'Name', 'Action')
];
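% NOTE: the actor above ends in a plain fullyConnectedLayer, so its output
% is unbounded; the policy can drift outside the useful action range and
% then look insensitive to the reward. A common pattern (a sketch only,
% assuming the action is bounded in [-1, 1]) appends a tanhLayer and a
% scalingLayer before creating the representation:
%   actorNetwork = [
%       featureInputLayer(numObservations,'Normalization','none','Name','State')
%       fullyConnectedLayer(64,'Name','actorFC1')
%       reluLayer('Name','actorRelu1')
%       fullyConnectedLayer(32,'Name','actorFC2')
%       reluLayer('Name','actorRelu2')
%       fullyConnectedLayer(numActions,'Name','actorFC3')
%       tanhLayer('Name','actorTanh')
%       scalingLayer('Name','Action','Scale',1)];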
% Actor representation options
actorOptions = rlRepresentationOptions('LearnRate', 1e-04, 'GradientThreshold', 1);
actor = rlDeterministicActorRepresentation(actorNetwork,obsInfo,actInfo,'Observation',{'State'},'Action',{'Action'},actorOptions);
% Define the agent options
agentOpts = rlDDPGAgentOptions(...
'SampleTime',Ts,...
'TargetSmoothFactor',1e-3,...
'DiscountFactor',1.0, ...
'MiniBatchSize',128, ...
'ExperienceBufferLength',1e6);
agentOpts.NoiseOptions.Variance = 0.3;
agentOpts.NoiseOptions.VarianceDecayRate = 1e-5;
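% NOTE: two settings worth re-checking if the policy never responds to the
% reward: (1) a DiscountFactor of exactly 1.0 is unusual for DDPG; the
% shipped MATLAB examples typically use ~0.99 so the bootstrapped Q-values
% stay bounded; (2) the noise Variance of 0.3 only explores effectively if
% it is on the same order as the action range, and with
% VarianceDecayRate = 1e-5 it takes roughly 69,000 steps to halve.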
agent = rlDDPGAgent(actor, critic, agentOpts);
maxepisodes = 5000;
maxsteps = ceil(Tf/Ts);
% Training options
trainOpts = rlTrainingOptions(...
'MaxEpisodes', maxepisodes, ...
'MaxStepsPerEpisode', maxsteps, ...
'ScoreAveragingWindowLength', 20, ...
'Verbose', false, ...
'Plots', 'training-progress',...
'StopTrainingCriteria', 'AverageReward',...
'StopTrainingValue', 800);
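% NOTE: with doTraining = false the train() call below never runs, so the
% networks are never updated; the actions then come from the loaded
% pretrained agent and cannot adapt to the reward. Set doTraining = true
% to actually train.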
doTraining = false;
if doTraining
% Train the agent
trainingStats = train(agent, env, trainOpts);
else
% Load the pretrained agent
load('SPSDABDDPG.mat', 'agent')
end
```
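One more gap worth noting: the script installs `env.ResetFcn = @(in)localResetFcn(in)`, but the body of `localResetFcn` is not shown. For MATLAB to find it, it must be defined as a local function at the end of the script (or be on the path). A minimal hypothetical sketch, assuming the reset only randomizes an episode reference through a constant block (the block path `SPSDAB/Desired Voltage` and the value range are placeholders, not taken from the actual model):

```matlab
% Hypothetical reset function; the real one was not included in the post.
function in = localResetFcn(in)
    % Randomize the reference so each episode starts from a different setpoint
    ref = 20 + 10*rand;   % placeholder range, not from the original model
    in = setBlockParameter(in,'SPSDAB/Desired Voltage','Value',num2str(ref));
end
```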