class DQN(nn.Module):

    def __init__(self, img_height, img_width):
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=img_height * img_width * 3, out_features=12)
        self.fc2 = nn.Linear(in_features=12, out_features=8)
       # self.fc3 = nn.Linear(in_features=10, out_features=8)
        self.clayer_in = torch.nn.Linear(in_features=8, out_features=wires)
        self.clayer_out = torch.nn.Linear(wires, out_dim)

        dev = qml.device('strawberryfields.fock', wires=wires, cutoff_dim=3)
        self.layer_qnode = qml.QNode(layer, dev)

        weights = qml.init.cvqnn_layers_all(n_quantum_layers, wires)
        weight_shapes = {"w{}".format(i): w.shape for i, w in enumerate(weights)}
        self.qlayer = qml.qnn.TorchLayer(self.layer_qnode, weight_shapes)

    def forward(self, t):
        t = self.flatten(t)
        t = self.fc1(t)
        t = self.fc2(t)
       # t = self.fc3(t)
        t = self.clayer_in(t)
        t = self.qlayer(t)
        t = self.clayer_out(t)
        t = t.sigmoid()
        return t

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
em = CartPoleEnvManager(device)
strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)
agent = Agent(strategy, em.num_actions_available(), device)
memory = ReplayMemory(memory_size)
#learning_rate = LearningRate(lr_start,lr_end,lr_decay)
#learn = lr(learning_rate)

policy_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)
target_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)
target_net.eval() #tells pytorch that target_net is only used for inference, not training
optimizer = optim.Adam(params=policy_net.parameters(), lr=0.01)

i = 0
episode_durations = []
for episode in range(num_episodes): #iterate over each episode
    program_starts = time.time()
    state = em.get_state()
    for timestep in count():
        action = agent.select_action(state, policy_net)
        reward = em.take_action(action)
        next_state = em.get_state()
        memory.push(Experience(state, action, next_state, reward))
        state = next_state
        if memory.can_provide_sample(batch_size):
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.9)
            experiences = memory.sample(batch_size)
            states, actions, rewards, next_states = extract_tensors(experiences)
            current_q_values = QValues.get_current(policy_net, states, actions)
            next_q_values = QValues.get_next(target_net, next_states) #will get the max qvalues of the next state, q values of next state are used via next state
            target_q_values = (next_q_values * gamma) + rewards

            loss = F.mse_loss(current_q_values, target_q_values.unsqueeze(1))
            optimizer.zero_grad() # sets the gradiesnt of all weights n biases in policy_net to zero
            loss.backward() #computes gradient of loss with respect to all weights n biases in the policy net
            optimizer.step() # updates the weights n biases with the gradients that were computed form loss.backwards
        if em.done:
            plot(episode_durations, 100)
    if episode % target_update == 0:
    now = time.time()
    print("Episode hat {0} Sekunden gedauert".format(now - program_starts))     


Traceback (most recent call last):
  File "qdqn.py", line 328, in <module>
    loss.backward() #computes gradient of loss with respect to all weights n biases in the policy net
  File "/home/ubuntu/anaconda3/envs/gymm/lib/python3.8/site-packages/torch/tensor.py", line 198, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/ubuntu/anaconda3/envs/gymm/lib/python3.8/site-packages/torch/autograd/__init__.py", line 98, in backward
RuntimeError: Expected object of device type cuda but got device type cpu for argument #2 'mat2' in call to _th_mm

